Merge branch 'block-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block into xfs-6.16-merge

Merging block tree into XFS because of some dependencies like
bdev_validate_blocksize()

Signed-off-by: Carlos Maiolino <cem@kernel.org>
This commit is contained in:
Carlos Maiolino
2025-05-14 12:20:57 +02:00
53 changed files with 1643 additions and 698 deletions

View File

@@ -152,27 +152,65 @@ static void set_init_blocksize(struct block_device *bdev)
get_order(bsize));
}
/**
* bdev_validate_blocksize - check that this block size is acceptable
* @bdev: blockdevice to check
* @block_size: block size to check
*
* For block device users that do not use buffer heads or the block device
* page cache, make sure that this block size can be used with the device.
*
* Return: On success zero is returned, negative error code on failure.
*/
int bdev_validate_blocksize(struct block_device *bdev, int block_size)
{
if (blk_validate_block_size(block_size))
return -EINVAL;
/* Size cannot be smaller than the size supported by the device */
if (block_size < bdev_logical_block_size(bdev))
return -EINVAL;
return 0;
}
EXPORT_SYMBOL_GPL(bdev_validate_blocksize);
int set_blocksize(struct file *file, int size)
{
struct inode *inode = file->f_mapping->host;
struct block_device *bdev = I_BDEV(inode);
int ret;
if (blk_validate_block_size(size))
return -EINVAL;
/* Size cannot be smaller than the size supported by the device */
if (size < bdev_logical_block_size(bdev))
return -EINVAL;
ret = bdev_validate_blocksize(bdev, size);
if (ret)
return ret;
if (!file->private_data)
return -EINVAL;
/* Don't change the size if it is same as current */
if (inode->i_blkbits != blksize_bits(size)) {
/*
* Flush and truncate the pagecache before we reconfigure the
* mapping geometry because folio sizes are variable now. If a
* reader has already allocated a folio whose size is smaller
* than the new min_order but invokes readahead after the new
* min_order becomes visible, readahead will think there are
* "zero" blocks per folio and crash. Take the inode and
* invalidation locks to avoid racing with
* read/write/fallocate.
*/
inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
sync_blockdev(bdev);
kill_bdev(bdev);
inode->i_blkbits = blksize_bits(size);
mapping_set_folio_min_order(inode->i_mapping, get_order(size));
kill_bdev(bdev);
filemap_invalidate_unlock(inode->i_mapping);
inode_unlock(inode);
}
return 0;
}
@@ -777,13 +815,13 @@ static void blkdev_put_part(struct block_device *part)
blkdev_put_whole(whole);
}
struct block_device *blkdev_get_no_open(dev_t dev)
struct block_device *blkdev_get_no_open(dev_t dev, bool autoload)
{
struct block_device *bdev;
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
if (inode)
@@ -1005,7 +1043,7 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (ret)
return ERR_PTR(ret);
bdev = blkdev_get_no_open(dev);
bdev = blkdev_get_no_open(dev, true);
if (!bdev)
return ERR_PTR(-ENXIO);
@@ -1275,18 +1313,15 @@ void sync_bdevs(bool wait)
void bdev_statx(struct path *path, struct kstat *stat,
u32 request_mask)
{
struct inode *backing_inode;
struct block_device *bdev;
backing_inode = d_backing_inode(path->dentry);
/*
* Note that backing_inode is the inode of a block device node file,
* not the block device's internal inode. Therefore it is *not* valid
* to use I_BDEV() here; the block device has to be looked up by i_rdev
* Note that d_backing_inode() returns the block device node inode, not
* the block device's internal inode. Therefore it is *not* valid to
* use I_BDEV() here; the block device has to be looked up by i_rdev
* instead.
*/
bdev = blkdev_get_no_open(backing_inode->i_rdev);
bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false);
if (!bdev)
return;

View File

@@ -9,6 +9,7 @@
* not aware of PI.
*/
#include <linux/blk-integrity.h>
#include <linux/t10-pi.h>
#include <linux/workqueue.h>
#include "blk.h"
@@ -43,6 +44,29 @@ static void bio_integrity_verify_fn(struct work_struct *work)
bio_endio(bio);
}
#define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
static bool bip_should_check(struct bio_integrity_payload *bip)
{
return bip->bip_flags & BIP_CHECK_FLAGS;
}
static bool bi_offload_capable(struct blk_integrity *bi)
{
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
return bi->tuple_size == sizeof(struct crc64_pi_tuple);
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
return bi->tuple_size == sizeof(struct t10_pi_tuple);
default:
pr_warn_once("%s: unknown integrity checksum type:%d\n",
__func__, bi->csum_type);
fallthrough;
case BLK_INTEGRITY_CSUM_NONE:
return false;
}
}
/**
* __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
@@ -54,12 +78,12 @@ static void bio_integrity_verify_fn(struct work_struct *work)
*/
bool __bio_integrity_endio(struct bio *bio)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_integrity_data *bid =
container_of(bip, struct bio_integrity_data, bip);
if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) {
if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
bip_should_check(bip)) {
INIT_WORK(&bid->work, bio_integrity_verify_fn);
queue_work(kintegrityd_wq, &bid->work);
return false;
@@ -84,6 +108,7 @@ bool bio_integrity_prep(struct bio *bio)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_data *bid;
bool set_flags = true;
gfp_t gfp = GFP_NOIO;
unsigned int len;
void *buf;
@@ -100,19 +125,24 @@ bool bio_integrity_prep(struct bio *bio)
switch (bio_op(bio)) {
case REQ_OP_READ:
if (bi->flags & BLK_INTEGRITY_NOVERIFY)
return true;
if (bi->flags & BLK_INTEGRITY_NOVERIFY) {
if (bi_offload_capable(bi))
return true;
set_flags = false;
}
break;
case REQ_OP_WRITE:
if (bi->flags & BLK_INTEGRITY_NOGENERATE)
return true;
/*
* Zero the memory allocated to not leak uninitialized kernel
* memory to disk for non-integrity metadata where nothing else
* initializes the memory.
*/
if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
if (bi->flags & BLK_INTEGRITY_NOGENERATE) {
if (bi_offload_capable(bi))
return true;
set_flags = false;
gfp |= __GFP_ZERO;
} else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
gfp |= __GFP_ZERO;
break;
default:
@@ -137,19 +167,21 @@ bool bio_integrity_prep(struct bio *bio)
bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY;
bip_set_seed(&bid->bip, bio->bi_iter.bi_sector);
if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
bid->bip.bip_flags |= BIP_IP_CHECKSUM;
if (bi->csum_type)
bid->bip.bip_flags |= BIP_CHECK_GUARD;
if (bi->flags & BLK_INTEGRITY_REF_TAG)
bid->bip.bip_flags |= BIP_CHECK_REFTAG;
if (set_flags) {
if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
bid->bip.bip_flags |= BIP_IP_CHECKSUM;
if (bi->csum_type)
bid->bip.bip_flags |= BIP_CHECK_GUARD;
if (bi->flags & BLK_INTEGRITY_REF_TAG)
bid->bip.bip_flags |= BIP_CHECK_REFTAG;
}
if (bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf)) < len)
goto err_end_io;
/* Auto-generate integrity metadata if this is a write */
if (bio_data_dir(bio) == WRITE)
if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip))
blk_integrity_generate(bio);
else
bid->saved_bio_iter = bio->bi_iter;

View File

@@ -66,16 +66,12 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
}
EXPORT_SYMBOL(bio_integrity_alloc);
static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
bool dirty)
static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs)
{
int i;
for (i = 0; i < nr_vecs; i++) {
if (dirty && !PageCompound(bv[i].bv_page))
set_page_dirty_lock(bv[i].bv_page);
for (i = 0; i < nr_vecs; i++)
unpin_user_page(bv[i].bv_page);
}
}
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
@@ -91,7 +87,7 @@ static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
WARN_ON_ONCE(ret != bytes);
bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true);
bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs);
}
/**
@@ -111,8 +107,7 @@ void bio_integrity_unmap_user(struct bio *bio)
return;
}
bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt,
bio_data_dir(bio) == READ);
bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt);
}
/**
@@ -198,7 +193,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
}
if (write)
bio_integrity_unpin_bvec(bvec, nr_vecs, false);
bio_integrity_unpin_bvec(bvec, nr_vecs);
else
memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
@@ -319,7 +314,7 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
return 0;
release_pages:
bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
bio_integrity_unpin_bvec(bvec, nr_bvecs);
free_bvec:
if (bvec != stack_vec)
kfree(bvec);

View File

@@ -797,7 +797,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
return -EINVAL;
input = skip_spaces(input);
bdev = blkdev_get_no_open(MKDEV(major, minor));
bdev = blkdev_get_no_open(MKDEV(major, minor), false);
if (!bdev)
return -ENODEV;
if (bdev_is_partition(bdev)) {

View File

@@ -61,8 +61,14 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
*
* There is no hardware limitation for the read-ahead size and the user
* might have increased the read-ahead size through sysfs, so don't ever
* decrease it.
*/
bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
bdi->ra_pages = max3(bdi->ra_pages,
lim->io_opt * 2 / PAGE_SIZE,
VM_READAHEAD_PAGES);
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}

View File

@@ -909,6 +909,8 @@ int blk_register_queue(struct gendisk *disk)
out_debugfs_remove:
blk_debugfs_remove(disk);
mutex_unlock(&q->sysfs_lock);
if (queue_is_mq(q))
blk_mq_sysfs_unregister(disk);
out_put_queue_kobj:
kobject_put(&disk->queue_kobj);
return ret;

View File

@@ -1,3 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H

View File

@@ -343,6 +343,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
op = REQ_OP_ZONE_RESET;
/* Invalidate the page cache, including dirty pages. */
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
if (ret)
@@ -364,8 +365,10 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
if (cmd == BLKRESETZONE)
if (cmd == BLKRESETZONE) {
filemap_invalidate_unlock(bdev->bd_mapping);
inode_unlock(bdev->bd_mapping->host);
}
return ret;
}

View File

@@ -94,6 +94,9 @@ static inline void blk_wait_io(struct completion *done)
wait_for_completion_io(done);
}
struct block_device *blkdev_get_no_open(dev_t dev, bool autoload);
void blkdev_put_no_open(struct block_device *bdev);
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
@@ -477,7 +480,8 @@ static inline void blk_zone_update_request_bio(struct request *rq,
* the original BIO sector so that blk_zone_write_plug_bio_endio() can
* lookup the zone write plug.
*/
if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
if (req_op(rq) == REQ_OP_ZONE_APPEND ||
bio_flagged(bio, BIO_EMULATES_ZONE_APPEND))
bio->bi_iter.bi_sector = rq->__sector;
}
void blk_zone_write_plug_bio_endio(struct bio *bio);

View File

@@ -642,7 +642,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
bdev = blkdev_get_no_open(inode->i_rdev);
bdev = blkdev_get_no_open(inode->i_rdev, true);
if (!bdev)
return -ENXIO;
@@ -746,7 +746,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = direct_write_fallback(iocb, from, ret,
blkdev_buffered_write(iocb, from));
} else {
/*
* Take i_rwsem and invalidate_lock to avoid racing with
* set_blocksize changing i_blkbits/folio order and punching
* out the pagecache.
*/
inode_lock_shared(bd_inode);
ret = blkdev_buffered_write(iocb, from);
inode_unlock_shared(bd_inode);
}
if (ret > 0)
@@ -757,6 +764,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
loff_t size = bdev_nr_bytes(bdev);
loff_t pos = iocb->ki_pos;
@@ -793,7 +801,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
goto reexpand;
}
/*
* Take i_rwsem and invalidate_lock to avoid racing with set_blocksize
* changing i_blkbits/folio order and punching out the pagecache.
*/
inode_lock_shared(bd_inode);
ret = filemap_read(iocb, to, ret);
inode_unlock_shared(bd_inode);
reexpand:
if (unlikely(shorted))
@@ -836,6 +850,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
if ((start | len) & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
/*
@@ -868,6 +883,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
fail:
filemap_invalidate_unlock(inode->i_mapping);
inode_unlock(inode);
return error;
}

View File

@@ -142,6 +142,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
if (err)
return err;
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
@@ -174,6 +175,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(bdev->bd_mapping);
inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -199,12 +201,14 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
end > bdev_nr_bytes(bdev))
return -EINVAL;
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end - 1);
if (!err)
err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
GFP_KERNEL);
filemap_invalidate_unlock(bdev->bd_mapping);
inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -236,6 +240,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
return -EINVAL;
/* Invalidate the page cache, including dirty pages */
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end);
if (err)
@@ -246,6 +251,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
fail:
filemap_invalidate_unlock(bdev->bd_mapping);
inode_unlock(bdev->bd_mapping->host);
return err;
}

View File

@@ -46,12 +46,8 @@ int ioprio_check_cap(int ioprio)
*/
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
fallthrough;
/* rt has prio field too */
case IOPRIO_CLASS_BE:
if (level >= IOPRIO_NR_LEVELS)
return -EINVAL;
break;
case IOPRIO_CLASS_BE:
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:

View File

@@ -388,12 +388,6 @@ config BLK_DEV_UBLK
definition isn't finalized yet, and might change according to future
requirement, so mark is as experimental now.
Say Y if you want to get better performance because task_work_add()
can be used in IO path for replacing io_uring cmd, which will become
shared between IO tasks and ubq daemon, meantime task_work_add() can
can handle batch more effectively, but task_work_add() isn't exported
for module, so ublk has to be built to kernel.
config BLKDEV_UBLK_LEGACY_OPCODES
bool "Support legacy command opcode"
depends on BLK_DEV_UBLK

View File

@@ -211,72 +211,6 @@ static void loop_set_size(struct loop_device *lo, loff_t size)
kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
}
static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
{
struct iov_iter i;
ssize_t bw;
iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len);
bw = vfs_iter_write(file, &i, ppos, 0);
if (likely(bw == bvec->bv_len))
return 0;
printk_ratelimited(KERN_ERR
"loop: Write error at byte offset %llu, length %i.\n",
(unsigned long long)*ppos, bvec->bv_len);
if (bw >= 0)
bw = -EIO;
return bw;
}
static int lo_write_simple(struct loop_device *lo, struct request *rq,
loff_t pos)
{
struct bio_vec bvec;
struct req_iterator iter;
int ret = 0;
rq_for_each_segment(bvec, rq, iter) {
ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
if (ret < 0)
break;
cond_resched();
}
return ret;
}
static int lo_read_simple(struct loop_device *lo, struct request *rq,
loff_t pos)
{
struct bio_vec bvec;
struct req_iterator iter;
struct iov_iter i;
ssize_t len;
rq_for_each_segment(bvec, rq, iter) {
iov_iter_bvec(&i, ITER_DEST, &bvec, 1, bvec.bv_len);
len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
if (len < 0)
return len;
flush_dcache_page(bvec.bv_page);
if (len != bvec.bv_len) {
struct bio *bio;
__rq_for_each_bio(bio, rq)
zero_fill_bio(bio);
break;
}
cond_resched();
}
return 0;
}
static void loop_clear_limits(struct loop_device *lo, int mode)
{
struct queue_limits lim = queue_limits_start_update(lo->lo_queue);
@@ -342,7 +276,7 @@ static void lo_complete_rq(struct request *rq)
struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
blk_status_t ret = BLK_STS_OK;
if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
if (cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
req_op(rq) != REQ_OP_READ) {
if (cmd->ret < 0)
ret = errno_to_blk_status(cmd->ret);
@@ -358,14 +292,13 @@ static void lo_complete_rq(struct request *rq)
cmd->ret = 0;
blk_mq_requeue_request(rq, true);
} else {
if (cmd->use_aio) {
struct bio *bio = rq->bio;
struct bio *bio = rq->bio;
while (bio) {
zero_fill_bio(bio);
bio = bio->bi_next;
}
while (bio) {
zero_fill_bio(bio);
bio = bio->bi_next;
}
ret = BLK_STS_IOERR;
end_io:
blk_mq_end_request(rq, ret);
@@ -445,9 +378,14 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
cmd->iocb.ki_pos = pos;
cmd->iocb.ki_filp = file;
cmd->iocb.ki_complete = lo_rw_aio_complete;
cmd->iocb.ki_flags = IOCB_DIRECT;
cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
cmd->iocb.ki_ioprio = req_get_ioprio(rq);
if (cmd->use_aio) {
cmd->iocb.ki_complete = lo_rw_aio_complete;
cmd->iocb.ki_flags = IOCB_DIRECT;
} else {
cmd->iocb.ki_complete = NULL;
cmd->iocb.ki_flags = 0;
}
if (rw == ITER_SOURCE)
ret = file->f_op->write_iter(&cmd->iocb, &iter);
@@ -458,7 +396,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
if (ret != -EIOCBQUEUED)
lo_rw_aio_complete(&cmd->iocb, ret);
return 0;
return -EIOCBQUEUED;
}
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
@@ -466,15 +404,6 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
/*
* lo_write_simple and lo_read_simple should have been covered
* by io submit style function like lo_rw_aio(), one blocker
* is that lo_read_simple() need to call flush_dcache_page after
* the page is written from kernel, and it isn't easy to handle
* this in io submit style function which submits all segments
* of the req at one time. And direct read IO doesn't need to
* run flush_dcache_page().
*/
switch (req_op(rq)) {
case REQ_OP_FLUSH:
return lo_req_flush(lo, rq);
@@ -490,15 +419,9 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
case REQ_OP_DISCARD:
return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
case REQ_OP_WRITE:
if (cmd->use_aio)
return lo_rw_aio(lo, cmd, pos, ITER_SOURCE);
else
return lo_write_simple(lo, rq, pos);
return lo_rw_aio(lo, cmd, pos, ITER_SOURCE);
case REQ_OP_READ:
if (cmd->use_aio)
return lo_rw_aio(lo, cmd, pos, ITER_DEST);
else
return lo_read_simple(lo, rq, pos);
return lo_rw_aio(lo, cmd, pos, ITER_DEST);
default:
WARN_ON_ONCE(1);
return -EIO;
@@ -582,6 +505,17 @@ static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
lo->lo_min_dio_size = loop_query_min_dio_size(lo);
}
static int loop_check_backing_file(struct file *file)
{
if (!file->f_op->read_iter)
return -EINVAL;
if ((file->f_mode & FMODE_WRITE) && !file->f_op->write_iter)
return -EINVAL;
return 0;
}
/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
@@ -603,6 +537,10 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
if (!file)
return -EBADF;
error = loop_check_backing_file(file);
if (error)
return error;
/* suppress uevents while reconfiguring the device */
dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1);
@@ -662,19 +600,20 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
* dependency.
*/
fput(old_file);
dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
if (partscan)
loop_reread_partitions(lo);
error = 0;
done:
/* enable and uncork uevent now that we are done */
dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
return error;
out_err:
loop_global_unlock(lo, is_loop);
out_putf:
fput(file);
dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
goto done;
}
@@ -1039,6 +978,14 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
if (!file)
return -EBADF;
if ((mode & BLK_OPEN_WRITE) && !file->f_op->write_iter)
return -EINVAL;
error = loop_check_backing_file(file);
if (error)
return error;
is_loop = is_loop_device(file);
/* This is safe, since we have a reference from open(). */
@@ -1129,8 +1076,8 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
if (partscan)
clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
/* enable and uncork uevent now that we are done */
dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
loop_global_unlock(lo, is_loop);
if (partscan)
@@ -1921,7 +1868,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
struct loop_device *lo = rq->q->queuedata;
int ret = 0;
struct mem_cgroup *old_memcg = NULL;
const bool use_aio = cmd->use_aio;
if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
ret = -EIO;
@@ -1951,7 +1897,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
}
failed:
/* complete non-aio request */
if (!use_aio || ret) {
if (ret != -EIOCBQUEUED) {
if (ret == -EOPNOTSUPP)
cmd->ret = ret;
else

View File

@@ -122,15 +122,6 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
/*
* IO command is aborted, so this flag is set in case of
* !UBLK_IO_FLAG_ACTIVE.
*
* After this flag is observed, any pending or new incoming request
* associated with this io command will be failed immediately
*/
#define UBLK_IO_FLAG_ABORTED 0x04
/*
* UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
* get data buffer address from ublksrv.
@@ -199,8 +190,6 @@ struct ublk_device {
struct completion completion;
unsigned int nr_queues_ready;
unsigned int nr_privileged_daemon;
struct work_struct nosrv_work;
};
/* header of ublk_params */
@@ -209,18 +198,13 @@ struct ublk_params_header {
__u32 types;
};
static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
struct ublk_queue *ubq, int tag, size_t offset);
const struct ublk_queue *ubq, int tag, size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
}
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_ZONED;
@@ -620,14 +604,19 @@ static void ublk_apply_params(struct ublk_device *ub)
ublk_dev_param_zoned_apply(ub);
}
static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
return ubq->flags & UBLK_F_USER_COPY;
}
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
return !ublk_support_user_copy(ubq);
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq);
}
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@@ -635,8 +624,11 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
/*
* read()/write() is involved in user copy, so request reference
* has to be grabbed
*
* for zero copy, request buffer need to be registered to io_uring
* buffer table, so reference is needed
*/
return ublk_support_user_copy(ubq);
return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq);
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
@@ -1074,7 +1066,7 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
{
return ubq->ubq_daemon->flags & PF_EXITING;
return !ubq->ubq_daemon || ubq->ubq_daemon->flags & PF_EXITING;
}
/* todo: handle partial completion */
@@ -1085,12 +1077,6 @@ static inline void __ublk_complete_rq(struct request *req)
unsigned int unmapped_bytes;
blk_status_t res = BLK_STS_OK;
/* called from ublk_abort_queue() code path */
if (io->flags & UBLK_IO_FLAG_ABORTED) {
res = BLK_STS_IOERR;
goto exit;
}
/* failed read IO if nothing is read */
if (!io->res && req_op(req) == REQ_OP_READ)
io->res = -EIO;
@@ -1140,47 +1126,6 @@ static void ublk_complete_rq(struct kref *ref)
__ublk_complete_rq(req);
}
static void ublk_do_fail_rq(struct request *req)
{
struct ublk_queue *ubq = req->mq_hctx->driver_data;
if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
blk_mq_requeue_request(req, false);
else
__ublk_complete_rq(req);
}
static void ublk_fail_rq_fn(struct kref *ref)
{
struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
ref);
struct request *req = blk_mq_rq_from_pdu(data);
ublk_do_fail_rq(req);
}
/*
* Since ublk_rq_task_work_cb always fails requests immediately during
* exiting, __ublk_fail_req() is only called from abort context during
* exiting. So lock is unnecessary.
*
* Also aborting may not be started yet, keep in mind that one failed
* request may be issued by block layer again.
*/
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
struct request *req)
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
kref_put(&data->ref, ublk_fail_rq_fn);
} else {
ublk_do_fail_rq(req);
}
}
static void ubq_complete_io_cmd(struct ublk_io *io, int res,
unsigned issue_flags)
{
@@ -1336,8 +1281,6 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
unsigned int nr_inflight = 0;
int i;
if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
if (!ubq->timeout) {
@@ -1348,26 +1291,6 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
return BLK_EH_DONE;
}
if (!ubq_daemon_is_dying(ubq))
return BLK_EH_RESET_TIMER;
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
nr_inflight++;
}
/* cancelable uring_cmd can't help us if all commands are in-flight */
if (nr_inflight == ubq->q_depth) {
struct ublk_device *ub = ubq->dev;
if (ublk_abort_requests(ub, ubq)) {
schedule_work(&ub->nosrv_work);
}
return BLK_EH_DONE;
}
return BLK_EH_RESET_TIMER;
}
@@ -1470,6 +1393,37 @@ static const struct blk_mq_ops ublk_mq_ops = {
.timeout = ublk_timeout,
};
static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;
/* All old ioucmds have to be completed */
ubq->nr_io_ready = 0;
/*
* old daemon is PF_EXITING, put it now
*
* It could be NULL in case of closing one quisced device.
*/
if (ubq->ubq_daemon)
put_task_struct(ubq->ubq_daemon);
/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
ubq->ubq_daemon = NULL;
ubq->timeout = false;
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
/*
* UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
* io->cmd
*/
io->flags &= UBLK_IO_FLAG_CANCELED;
io->cmd = NULL;
io->addr = 0;
}
}
static int ublk_ch_open(struct inode *inode, struct file *filp)
{
struct ublk_device *ub = container_of(inode->i_cdev,
@@ -1481,10 +1435,119 @@ static int ublk_ch_open(struct inode *inode, struct file *filp)
return 0;
}
static void ublk_reset_ch_dev(struct ublk_device *ub)
{
int i;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_queue_reinit(ub, ublk_get_queue(ub, i));
/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
ub->mm = NULL;
ub->nr_queues_ready = 0;
ub->nr_privileged_daemon = 0;
}
static struct gendisk *ublk_get_disk(struct ublk_device *ub)
{
struct gendisk *disk;
spin_lock(&ub->lock);
disk = ub->ub_disk;
if (disk)
get_device(disk_to_dev(disk));
spin_unlock(&ub->lock);
return disk;
}
static void ublk_put_disk(struct gendisk *disk)
{
if (disk)
put_device(disk_to_dev(disk));
}
static int ublk_ch_release(struct inode *inode, struct file *filp)
{
struct ublk_device *ub = filp->private_data;
struct gendisk *disk;
int i;
/*
* disk isn't attached yet, either device isn't live, or it has
* been removed already, so we needn't to do anything
*/
disk = ublk_get_disk(ub);
if (!disk)
goto out;
/*
* All uring_cmd are done now, so abort any request outstanding to
* the ublk server
*
* This can be done in lockless way because ublk server has been
* gone
*
* More importantly, we have to provide forward progress guarantee
* without holding ub->mutex, otherwise control task grabbing
* ub->mutex triggers deadlock
*
* All requests may be inflight, so ->canceling may not be set, set
* it now.
*/
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
struct ublk_queue *ubq = ublk_get_queue(ub, i);
ubq->canceling = true;
ublk_abort_queue(ub, ubq);
}
blk_mq_kick_requeue_list(disk->queue);
/*
* All infligh requests have been completed or requeued and any new
* request will be failed or requeued via `->canceling` now, so it is
* fine to grab ub->mutex now.
*/
mutex_lock(&ub->mutex);
/* double check after grabbing lock */
if (!ub->ub_disk)
goto unlock;
/*
* Transition the device to the nosrv state. What exactly this
* means depends on the recovery flags
*/
blk_mq_quiesce_queue(disk->queue);
if (ublk_nosrv_should_stop_dev(ub)) {
/*
* Allow any pending/future I/O to pass through quickly
* with an error. This is needed because del_gendisk
* waits for all pending I/O to complete
*/
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_get_queue(ub, i)->force_abort = true;
blk_mq_unquiesce_queue(disk->queue);
ublk_stop_dev_unlocked(ub);
} else {
if (ublk_nosrv_dev_should_queue_io(ub)) {
/* ->canceling is set and all requests are aborted */
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
} else {
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_get_queue(ub, i)->fail_io = true;
}
blk_mq_unquiesce_queue(disk->queue);
}
unlock:
mutex_unlock(&ub->mutex);
ublk_put_disk(disk);
/* all uring_cmd has been done now, reset device & ubq */
ublk_reset_ch_dev(ub);
out:
clear_bit(UB_STATE_OPEN, &ub->state);
return 0;
}
@@ -1551,10 +1614,26 @@ static void ublk_commit_completion(struct ublk_device *ub,
ublk_put_req_ref(ubq, req);
}
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
struct request *req)
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
blk_mq_requeue_request(req, false);
else {
io->res = -EIO;
__ublk_complete_rq(req);
}
}
/*
* Called from ubq_daemon context via cancel fn, meantime quiesce ublk
* blk-mq queue, so we are called exclusively with blk-mq and ubq_daemon
* context, so everything is serialized.
* Called from ublk char device release handler, when any uring_cmd is
* done, meantime request queue is "quiesced" since all inflight requests
* can't be completed because ublk server is dead.
*
* So no one can hold our request IO reference any more, simply ignore the
* reference, and complete the request immediately
*/
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
@@ -1571,46 +1650,29 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
* will do it
*/
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
if (rq && blk_mq_request_started(rq)) {
io->flags |= UBLK_IO_FLAG_ABORTED;
if (rq && blk_mq_request_started(rq))
__ublk_fail_req(ubq, io, rq);
}
}
}
}
/* Must be called when queue is frozen */
static bool ublk_mark_queue_canceling(struct ublk_queue *ubq)
static void ublk_mark_queue_canceling(struct ublk_queue *ubq)
{
bool canceled;
spin_lock(&ubq->cancel_lock);
canceled = ubq->canceling;
if (!canceled)
if (!ubq->canceling)
ubq->canceling = true;
spin_unlock(&ubq->cancel_lock);
return canceled;
}
static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
static void ublk_start_cancel(struct ublk_queue *ubq)
{
bool was_canceled = ubq->canceling;
struct gendisk *disk;
if (was_canceled)
return false;
spin_lock(&ub->lock);
disk = ub->ub_disk;
if (disk)
get_device(disk_to_dev(disk));
spin_unlock(&ub->lock);
struct ublk_device *ub = ubq->dev;
struct gendisk *disk = ublk_get_disk(ub);
/* Our disk has been dead */
if (!disk)
return false;
return;
/*
* Now we are serialized with ublk_queue_rq()
*
@@ -1619,25 +1681,36 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
* touch completed uring_cmd
*/
blk_mq_quiesce_queue(disk->queue);
was_canceled = ublk_mark_queue_canceling(ubq);
if (!was_canceled) {
/* abort queue is for making forward progress */
ublk_abort_queue(ub, ubq);
}
ublk_mark_queue_canceling(ubq);
blk_mq_unquiesce_queue(disk->queue);
put_device(disk_to_dev(disk));
return !was_canceled;
ublk_put_disk(disk);
}
static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
unsigned int issue_flags)
{
struct ublk_io *io = &ubq->ios[tag];
struct ublk_device *ub = ubq->dev;
struct request *req;
bool done;
if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
return;
/*
* Don't try to cancel this command if the request is started for
* avoiding race between io_uring_cmd_done() and
* io_uring_cmd_complete_in_task().
*
* Either the started request will be aborted via __ublk_abort_rq(),
* then this uring_cmd is canceled next time, or it will be done in
* task work function ublk_dispatch_req() because io_uring guarantees
* that ublk_dispatch_req() is always called
*/
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
if (req && blk_mq_request_started(req))
return;
spin_lock(&ubq->cancel_lock);
done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
if (!done)
@@ -1651,6 +1724,17 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
/*
* The ublk char device won't be closed when calling cancel fn, so both
* ublk device and queue are guaranteed to be live
*
* Two-stage cancel:
*
* - make every active uring_cmd done in ->cancel_fn()
*
* - aborting inflight ublk IO requests in ublk char device release handler,
* which depends on 1st stage because device can only be closed iff all
* uring_cmd are done
*
* Do _not_ try to acquire ub->mutex before all inflight requests are
* aborted, otherwise deadlock may be caused.
*/
static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
unsigned int issue_flags)
@@ -1658,9 +1742,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
struct ublk_queue *ubq = pdu->ubq;
struct task_struct *task;
struct ublk_device *ub;
bool need_schedule;
struct ublk_io *io;
if (WARN_ON_ONCE(!ubq))
return;
@@ -1672,16 +1753,11 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
return;
ub = ubq->dev;
need_schedule = ublk_abort_requests(ub, ubq);
if (!ubq->canceling)
ublk_start_cancel(ubq);
io = &ubq->ios[pdu->tag];
WARN_ON_ONCE(io->cmd != cmd);
ublk_cancel_cmd(ubq, io, issue_flags);
if (need_schedule) {
schedule_work(&ub->nosrv_work);
}
WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd);
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
}
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
@@ -1694,7 +1770,7 @@ static void ublk_cancel_queue(struct ublk_queue *ubq)
int i;
for (i = 0; i < ubq->q_depth; i++)
ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
}
/* Cancel all pending commands, must be called after del_gendisk() returns */
@@ -1732,33 +1808,20 @@ static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
}
}
static void __ublk_quiesce_dev(struct ublk_device *ub)
static void ublk_force_abort_dev(struct ublk_device *ub)
{
pr_devel("%s: quiesce ub: dev_id %d state %s\n",
int i;
pr_devel("%s: force abort ub: dev_id %d state %s\n",
__func__, ub->dev_info.dev_id,
ub->dev_info.state == UBLK_S_DEV_LIVE ?
"LIVE" : "QUIESCED");
blk_mq_quiesce_queue(ub->ub_disk->queue);
ublk_wait_tagset_rqs_idle(ub);
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
}
if (ub->dev_info.state == UBLK_S_DEV_LIVE)
ublk_wait_tagset_rqs_idle(ub);
static void ublk_unquiesce_dev(struct ublk_device *ub)
{
int i;
pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
__func__, ub->dev_info.dev_id,
ub->dev_info.state == UBLK_S_DEV_LIVE ?
"LIVE" : "QUIESCED");
/* quiesce_work has run. We let requeued rqs be aborted
* before running fallback_wq. "force_abort" must be seen
* after request queue is unqiuesced. Then del_gendisk()
* can move on.
*/
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_get_queue(ub, i)->force_abort = true;
blk_mq_unquiesce_queue(ub->ub_disk->queue);
/* We may have requeued some rqs in ublk_quiesce_queue() */
blk_mq_kick_requeue_list(ub->ub_disk->queue);
@@ -1779,61 +1842,51 @@ static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
return disk;
}
static void ublk_stop_dev(struct ublk_device *ub)
static void ublk_stop_dev_unlocked(struct ublk_device *ub)
__must_hold(&ub->mutex)
{
struct gendisk *disk;
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
goto unlock;
if (ublk_nosrv_dev_should_queue_io(ub)) {
if (ub->dev_info.state == UBLK_S_DEV_LIVE)
__ublk_quiesce_dev(ub);
ublk_unquiesce_dev(ub);
}
return;
if (ublk_nosrv_dev_should_queue_io(ub))
ublk_force_abort_dev(ub);
del_gendisk(ub->ub_disk);
disk = ublk_detach_disk(ub);
put_disk(disk);
unlock:
}
static void ublk_stop_dev(struct ublk_device *ub)
{
mutex_lock(&ub->mutex);
ublk_stop_dev_unlocked(ub);
mutex_unlock(&ub->mutex);
ublk_cancel_dev(ub);
}
static void ublk_nosrv_work(struct work_struct *work)
/* reset ublk io_uring queue & io flags */
static void ublk_reset_io_flags(struct ublk_device *ub)
{
struct ublk_device *ub =
container_of(work, struct ublk_device, nosrv_work);
int i;
int i, j;
if (ublk_nosrv_should_stop_dev(ub)) {
ublk_stop_dev(ub);
return;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
struct ublk_queue *ubq = ublk_get_queue(ub, i);
/* UBLK_IO_FLAG_CANCELED can be cleared now */
spin_lock(&ubq->cancel_lock);
for (j = 0; j < ubq->q_depth; j++)
ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
spin_unlock(&ubq->cancel_lock);
ubq->canceling = false;
ubq->fail_io = false;
}
mutex_lock(&ub->mutex);
if (ub->dev_info.state != UBLK_S_DEV_LIVE)
goto unlock;
if (ublk_nosrv_dev_should_queue_io(ub)) {
__ublk_quiesce_dev(ub);
} else {
blk_mq_quiesce_queue(ub->ub_disk->queue);
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
ublk_get_queue(ub, i)->fail_io = true;
}
blk_mq_unquiesce_queue(ub->ub_disk->queue);
}
unlock:
mutex_unlock(&ub->mutex);
ublk_cancel_dev(ub);
}
/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
__must_hold(&ub->mutex)
{
mutex_lock(&ub->mutex);
ubq->nr_io_ready++;
if (ublk_queue_ready(ubq)) {
ubq->ubq_daemon = current;
@@ -1843,18 +1896,12 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
if (capable(CAP_SYS_ADMIN))
ub->nr_privileged_daemon++;
}
if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
/* now we are ready for handling ublk io request */
ublk_reset_io_flags(ub);
complete_all(&ub->completion);
mutex_unlock(&ub->mutex);
}
static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
int tag)
{
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
ublk_queue_cmd(ubq, req);
}
}
static inline int ublk_check_cmd_op(u32 cmd_op)
@@ -1902,13 +1949,20 @@ static void ublk_io_release(void *priv)
}
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
struct ublk_queue *ubq, unsigned int tag,
const struct ublk_queue *ubq, unsigned int tag,
unsigned int index, unsigned int issue_flags)
{
struct ublk_device *ub = cmd->file->private_data;
const struct ublk_io *io = &ubq->ios[tag];
struct request *req;
int ret;
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
return -EINVAL;
req = __ublk_check_and_get_req(ub, ubq, tag, 0);
if (!req)
return -EINVAL;
@@ -1924,11 +1978,66 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
}
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
const struct ublk_queue *ubq, unsigned int tag,
unsigned int index, unsigned int issue_flags)
{
const struct ublk_io *io = &ubq->ios[tag];
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
return -EINVAL;
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
struct ublk_io *io, __u64 buf_addr)
{
struct ublk_device *ub = ubq->dev;
int ret = 0;
/*
* When handling FETCH command for setting up ublk uring queue,
* ub->mutex is the innermost lock, and we won't block for handling
* FETCH, so it is fine even for IO_URING_F_NONBLOCK.
*/
mutex_lock(&ub->mutex);
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
if (ublk_queue_ready(ubq)) {
ret = -EBUSY;
goto out;
}
/* allow each command to be FETCHed at most once */
if (io->flags & UBLK_IO_FLAG_ACTIVE) {
ret = -EINVAL;
goto out;
}
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
if (ublk_need_map_io(ubq)) {
/*
* FETCH_RQ has to provide IO buffer if NEED GET
* DATA is not enabled
*/
if (!buf_addr && !ublk_need_get_data(ubq))
goto out;
} else if (buf_addr) {
/* User copy requires addr to be unset */
ret = -EINVAL;
goto out;
}
ublk_fill_io_cmd(io, cmd, buf_addr);
ublk_mark_io_ready(ub, ubq);
out:
mutex_unlock(&ub->mutex);
return ret;
}
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1983,35 +2092,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_IO_REGISTER_IO_BUF:
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
case UBLK_IO_UNREGISTER_IO_BUF:
return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags);
return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
if (ublk_queue_ready(ubq)) {
ret = -EBUSY;
ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
if (ret)
goto out;
}
/*
* The io is being handled by server, so COMMIT_RQ is expected
* instead of FETCH_REQ
*/
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
goto out;
if (ublk_need_map_io(ubq)) {
/*
* FETCH_RQ has to provide IO buffer if NEED GET
* DATA is not enabled
*/
if (!ub_cmd->addr && !ublk_need_get_data(ubq))
goto out;
} else if (ub_cmd->addr) {
/* User copy requires addr to be unset */
ret = -EINVAL;
goto out;
}
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_mark_io_ready(ub, ubq);
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
@@ -2043,8 +2128,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
break;
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
ublk_dispatch_req(ubq, req, issue_flags);
return -EIOCBQUEUED;
default:
goto out;
}
@@ -2058,13 +2144,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
}
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
struct ublk_queue *ubq, int tag, size_t offset)
const struct ublk_queue *ubq, int tag, size_t offset)
{
struct request *req;
if (!ublk_need_req_ref(ubq))
return NULL;
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
if (!req)
return NULL;
@@ -2178,6 +2261,9 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
if (!ubq)
return ERR_PTR(-EINVAL);
if (!ublk_support_user_copy(ubq))
return ERR_PTR(-EACCES);
if (tag >= ubq->q_depth)
return ERR_PTR(-EINVAL);
@@ -2411,7 +2497,6 @@ static void ublk_remove(struct ublk_device *ub)
bool unprivileged;
ublk_stop_dev(ub);
cancel_work_sync(&ub->nosrv_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev);
unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
ublk_put_device(ub);
@@ -2696,7 +2781,6 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
goto out_unlock;
mutex_init(&ub->mutex);
spin_lock_init(&ub->lock);
INIT_WORK(&ub->nosrv_work, ublk_nosrv_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
if (ret < 0)
@@ -2718,13 +2802,18 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
UBLK_F_URING_CMD_COMP_IN_TASK;
/* GET_DATA isn't needed any more with USER_COPY */
if (ublk_dev_is_user_copy(ub))
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
/* Zoned storage support requires user copy feature */
/*
* Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
* returning write_append_lba, which is only allowed in case of
* user copy or zero copy
*/
if (ublk_dev_is_zoned(ub) &&
(!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
(!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
(UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
ret = -EINVAL;
goto out_free_dev_number;
}
@@ -2828,7 +2917,6 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
static int ublk_ctrl_stop_dev(struct ublk_device *ub)
{
ublk_stop_dev(ub);
cancel_work_sync(&ub->nosrv_work);
return 0;
}
@@ -2932,42 +3020,14 @@ static int ublk_ctrl_set_params(struct ublk_device *ub,
return ret;
}
static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;
WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
/* All old ioucmds have to be completed */
ubq->nr_io_ready = 0;
/* old daemon is PF_EXITING, put it now */
put_task_struct(ubq->ubq_daemon);
/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
ubq->ubq_daemon = NULL;
ubq->timeout = false;
ubq->canceling = false;
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
/* forget everything now and be ready for new FETCH_REQ */
io->flags = 0;
io->cmd = NULL;
io->addr = 0;
}
}
static int ublk_ctrl_start_recovery(struct ublk_device *ub,
const struct ublksrv_ctrl_cmd *header)
{
int ret = -EINVAL;
int i;
mutex_lock(&ub->mutex);
if (ublk_nosrv_should_stop_dev(ub))
goto out_unlock;
if (!ub->nr_queues_ready)
goto out_unlock;
/*
* START_RECOVERY is only allowd after:
*
@@ -2991,12 +3051,6 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
goto out_unlock;
}
pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_queue_reinit(ub, ublk_get_queue(ub, i));
/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
ub->mm = NULL;
ub->nr_queues_ready = 0;
ub->nr_privileged_daemon = 0;
init_completion(&ub->completion);
ret = 0;
out_unlock:
@@ -3009,7 +3063,6 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
{
int ublksrv_pid = (int)header->data[0];
int ret = -EINVAL;
int i;
pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
__func__, ub->dev_info.nr_hw_queues, header->dev_id);
@@ -3029,24 +3082,10 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
goto out_unlock;
}
ub->dev_info.ublksrv_pid = ublksrv_pid;
ub->dev_info.state = UBLK_S_DEV_LIVE;
pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
__func__, ublksrv_pid, header->dev_id);
if (ublk_nosrv_dev_should_queue_io(ub)) {
ub->dev_info.state = UBLK_S_DEV_LIVE;
blk_mq_unquiesce_queue(ub->ub_disk->queue);
pr_devel("%s: queue unquiesced, dev id %d.\n",
__func__, header->dev_id);
blk_mq_kick_requeue_list(ub->ub_disk->queue);
} else {
blk_mq_quiesce_queue(ub->ub_disk->queue);
ub->dev_info.state = UBLK_S_DEV_LIVE;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
ublk_get_queue(ub, i)->fail_io = false;
}
blk_mq_unquiesce_queue(ub->ub_disk->queue);
}
blk_mq_kick_requeue_list(ub->ub_disk->queue);
ret = 0;
out_unlock:
mutex_unlock(&ub->mutex);

View File

@@ -2357,9 +2357,8 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
if (!bitmap)
return -ENOENT;
if (bitmap->mddev->bitmap_info.external)
return -ENOENT;
if (!bitmap->storage.sb_page) /* no superblock */
if (!bitmap->mddev->bitmap_info.external &&
!bitmap->storage.sb_page)
return -EINVAL;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);

View File

@@ -2200,14 +2200,9 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
if (!rdev_set_badblocks(rdev, sect, s, 0))
abort = 1;
}
if (abort) {
conf->recovery_disabled =
mddev->recovery_disabled;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_done_sync(mddev, r1_bio->sectors, 0);
put_buf(r1_bio);
if (abort)
return 0;
}
/* Try next page */
sectors -= s;
sect += s;
@@ -2346,10 +2341,21 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
int disks = conf->raid_disks * 2;
struct bio *wbio;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
/* ouch - failed to read all of that. */
if (!fix_sync_read_error(r1_bio))
if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
/*
* ouch - failed to read all of that.
* No need to fix read error for check/repair
* because all member disks are read.
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) ||
!fix_sync_read_error(r1_bio)) {
conf->recovery_disabled = mddev->recovery_disabled;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_done_sync(mddev, r1_bio->sectors, 0);
put_buf(r1_bio);
return;
}
}
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
process_checks(r1_bio);

View File

@@ -1735,6 +1735,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
* The discard bio returns only first r10bio finishes
*/
if (first_copy) {
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
set_bit(R10BIO_Discard, &r10_bio->state);
first_copy = false;

View File

@@ -102,6 +102,7 @@ config NVME_TCP_TLS
depends on NVME_TCP
select NET_HANDSHAKE
select KEYS
select TLS
help
Enables TLS encryption for NVMe TCP using the netlink handshake API.

View File

@@ -4300,7 +4300,7 @@ static void nvme_scan_work(struct work_struct *work)
if (test_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events))
nvme_queue_scan(ctrl);
#ifdef CONFIG_NVME_MULTIPATH
else
else if (ctrl->ana_log_buf)
/* Re-read the ANA log page to not miss updates */
queue_work(nvme_wq, &ctrl->ana_work);
#endif
@@ -4493,7 +4493,8 @@ static void nvme_fw_act_work(struct work_struct *work)
msleep(100);
}
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING) ||
!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
return;
nvme_unquiesce_io_queues(ctrl);

View File

@@ -1050,6 +1050,13 @@ void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
srcu_idx = srcu_read_lock(&head->srcu);
list_for_each_entry_rcu(ns, &head->list, siblings) {
/*
* Ensure that ns path disk node is already added otherwise we
* may get invalid kobj name for target
*/
if (!test_bit(GD_ADDED, &ns->disk->state))
continue;
/*
* Avoid creating link if it already exists for the given path.
* When path ana state transitions from optimized to non-
@@ -1065,13 +1072,6 @@ void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
continue;
/*
* Ensure that ns path disk node is already added otherwise we
* may get invalid kobj name for target
*/
if (!test_bit(GD_ADDED, &ns->disk->state))
continue;
target = disk_to_dev(ns->disk);
/*
* Create sysfs link from head gendisk kobject @kobj to the

View File

@@ -3575,7 +3575,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
dev_info(dev->ctrl.device, "restart after slot reset\n");
pci_restore_state(pdev);
if (!nvme_try_sched_reset(&dev->ctrl))
if (nvme_try_sched_reset(&dev->ctrl))
nvme_unquiesce_io_queues(&dev->ctrl);
return PCI_ERS_RESULT_RECOVERED;
}
@@ -3623,6 +3623,9 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */
.driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, },
{ PCI_DEVICE(0x126f, 0x1001), /* Silicon Motion generic */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_BOGUS_NID, },
@@ -3646,6 +3649,9 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE(0x15b7, 0x5008), /* Sandisk SN530 */
.driver_data = NVME_QUIRK_BROKEN_MSI },
{ PCI_DEVICE(0x15b7, 0x5009), /* Sandisk SN550 */
.driver_data = NVME_QUIRK_BROKEN_MSI |
NVME_QUIRK_NO_DEEPEST_PS },
{ PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */

View File

@@ -1946,7 +1946,7 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
cancel_work_sync(&queue->io_work);
}
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
@@ -1965,6 +1965,31 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
mutex_unlock(&queue->queue_lock);
}
static void nvme_tcp_wait_queue(struct nvme_ctrl *nctrl, int qid)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
int timeout = 100;
while (timeout > 0) {
if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags) ||
!sk_wmem_alloc_get(queue->sock->sk))
return;
msleep(2);
timeout -= 2;
}
dev_warn(nctrl->device,
"qid %d: timeout draining sock wmem allocation expired\n",
qid);
}
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
{
nvme_tcp_stop_queue_nowait(nctrl, qid);
nvme_tcp_wait_queue(nctrl, qid);
}
static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
{
write_lock_bh(&queue->sock->sk->sk_callback_lock);
@@ -2032,7 +2057,9 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
int i;
for (i = 1; i < ctrl->queue_count; i++)
nvme_tcp_stop_queue(ctrl, i);
nvme_tcp_stop_queue_nowait(ctrl, i);
for (i = 1; i < ctrl->queue_count; i++)
nvme_tcp_wait_queue(ctrl, i);
}
static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,

View File

@@ -98,6 +98,7 @@ config NVME_TARGET_TCP_TLS
bool "NVMe over Fabrics TCP target TLS encryption support"
depends on NVME_TARGET_TCP
select NET_HANDSHAKE
select TLS
help
Enables TLS encryption for the NVMe TCP target using the netlink handshake API.

View File

@@ -240,7 +240,7 @@ void nvmet_auth_sq_free(struct nvmet_sq *sq)
{
cancel_delayed_work(&sq->auth_expired_work);
#ifdef CONFIG_NVME_TARGET_TCP_TLS
sq->tls_key = 0;
sq->tls_key = NULL;
#endif
kfree(sq->dhchap_c1);
sq->dhchap_c1 = NULL;
@@ -600,13 +600,12 @@ void nvmet_auth_insert_psk(struct nvmet_sq *sq)
pr_warn("%s: ctrl %d qid %d failed to refresh key, error %ld\n",
__func__, sq->ctrl->cntlid, sq->qid, PTR_ERR(tls_key));
tls_key = NULL;
kfree_sensitive(tls_psk);
}
if (sq->ctrl->tls_key)
key_put(sq->ctrl->tls_key);
sq->ctrl->tls_key = tls_key;
#endif
kfree_sensitive(tls_psk);
out_free_digest:
kfree_sensitive(digest);
out_free_psk:

View File

@@ -324,6 +324,9 @@ int nvmet_enable_port(struct nvmet_port *port)
lockdep_assert_held(&nvmet_config_sem);
if (port->disc_addr.trtype == NVMF_TRTYPE_MAX)
return -EINVAL;
ops = nvmet_transports[port->disc_addr.trtype];
if (!ops) {
up_write(&nvmet_config_sem);

View File

@@ -1648,16 +1648,17 @@ static int nvmet_pci_epf_process_sq(struct nvmet_pci_epf_ctrl *ctrl,
{
struct nvmet_pci_epf_iod *iod;
int ret, n = 0;
u16 head = sq->head;
sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db);
while (sq->head != sq->tail && (!ctrl->sq_ab || n < ctrl->sq_ab)) {
while (head != sq->tail && (!ctrl->sq_ab || n < ctrl->sq_ab)) {
iod = nvmet_pci_epf_alloc_iod(sq);
if (!iod)
break;
/* Get the NVMe command submitted by the host. */
ret = nvmet_pci_epf_transfer(ctrl, &iod->cmd,
sq->pci_addr + sq->head * sq->qes,
sq->pci_addr + head * sq->qes,
sq->qes, DMA_FROM_DEVICE);
if (ret) {
/* Not much we can do... */
@@ -1666,12 +1667,13 @@ static int nvmet_pci_epf_process_sq(struct nvmet_pci_epf_ctrl *ctrl,
}
dev_dbg(ctrl->dev, "SQ[%u]: head %u, tail %u, command %s\n",
sq->qid, sq->head, sq->tail,
sq->qid, head, sq->tail,
nvmet_pci_epf_iod_name(iod));
sq->head++;
if (sq->head == sq->depth)
sq->head = 0;
head++;
if (head == sq->depth)
head = 0;
WRITE_ONCE(sq->head, head);
n++;
queue_work_on(WORK_CPU_UNBOUND, sq->iod_wq, &iod->work);
@@ -1761,8 +1763,17 @@ static void nvmet_pci_epf_cq_work(struct work_struct *work)
if (!iod)
break;
/* Post the IOD completion entry. */
/*
* Post the IOD completion entry. If the IOD request was
* executed (req->execute() called), the CQE is already
* initialized. However, the IOD may have been failed before
* that, leaving the CQE not properly initialized. So always
* initialize it here.
*/
cqe = &iod->cqe;
cqe->sq_head = cpu_to_le16(READ_ONCE(iod->sq->head));
cqe->sq_id = cpu_to_le16(iod->sq->qid);
cqe->command_id = iod->cmd.common.command_id;
cqe->status = cpu_to_le16((iod->status << 1) | cq->phase);
dev_dbg(ctrl->dev,
@@ -1800,6 +1811,21 @@ static void nvmet_pci_epf_cq_work(struct work_struct *work)
NVMET_PCI_EPF_CQ_RETRY_INTERVAL);
}
static void nvmet_pci_epf_clear_ctrl_config(struct nvmet_pci_epf_ctrl *ctrl)
{
struct nvmet_ctrl *tctrl = ctrl->tctrl;
/* Initialize controller status. */
tctrl->csts = 0;
ctrl->csts = 0;
nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);
/* Initialize controller configuration and start polling. */
tctrl->cc = 0;
ctrl->cc = 0;
nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CC, ctrl->cc);
}
static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
{
u64 pci_addr, asq, acq;
@@ -1865,18 +1891,20 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
return 0;
err:
ctrl->csts = 0;
nvmet_pci_epf_clear_ctrl_config(ctrl);
return -EINVAL;
}
static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl,
bool shutdown)
{
int qid;
if (!ctrl->enabled)
return;
dev_info(ctrl->dev, "Disabling controller\n");
dev_info(ctrl->dev, "%s controller\n",
shutdown ? "Shutting down" : "Disabling");
ctrl->enabled = false;
cancel_delayed_work_sync(&ctrl->poll_sqs);
@@ -1893,6 +1921,11 @@ static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
ctrl->csts &= ~NVME_CSTS_RDY;
if (shutdown) {
ctrl->csts |= NVME_CSTS_SHST_CMPLT;
ctrl->cc &= ~NVME_CC_ENABLE;
nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CC, ctrl->cc);
}
}
static void nvmet_pci_epf_poll_cc_work(struct work_struct *work)
@@ -1919,12 +1952,10 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work)
}
if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc))
nvmet_pci_epf_disable_ctrl(ctrl);
nvmet_pci_epf_disable_ctrl(ctrl, false);
if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc)) {
nvmet_pci_epf_disable_ctrl(ctrl);
ctrl->csts |= NVME_CSTS_SHST_CMPLT;
}
if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc))
nvmet_pci_epf_disable_ctrl(ctrl, true);
if (!nvmet_cc_shn(new_cc) && nvmet_cc_shn(old_cc))
ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
@@ -1963,16 +1994,10 @@ static void nvmet_pci_epf_init_bar(struct nvmet_pci_epf_ctrl *ctrl)
/* Clear Controller Memory Buffer Supported (CMBS). */
ctrl->cap &= ~(0x1ULL << 57);
/* Controller configuration. */
ctrl->cc = tctrl->cc & (~NVME_CC_ENABLE);
/* Controller status. */
ctrl->csts = ctrl->tctrl->csts;
nvmet_pci_epf_bar_write64(ctrl, NVME_REG_CAP, ctrl->cap);
nvmet_pci_epf_bar_write32(ctrl, NVME_REG_VS, tctrl->subsys->ver);
nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);
nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CC, ctrl->cc);
nvmet_pci_epf_clear_ctrl_config(ctrl);
}
static int nvmet_pci_epf_create_ctrl(struct nvmet_pci_epf *nvme_epf,
@@ -2070,14 +2095,22 @@ static int nvmet_pci_epf_create_ctrl(struct nvmet_pci_epf *nvme_epf,
static void nvmet_pci_epf_start_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
{
dev_info(ctrl->dev, "PCI link up\n");
ctrl->link_up = true;
schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
}
static void nvmet_pci_epf_stop_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
{
dev_info(ctrl->dev, "PCI link down\n");
ctrl->link_up = false;
cancel_delayed_work_sync(&ctrl->poll_cc);
nvmet_pci_epf_disable_ctrl(ctrl);
nvmet_pci_epf_disable_ctrl(ctrl, false);
nvmet_pci_epf_clear_ctrl_config(ctrl);
}
static void nvmet_pci_epf_destroy_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
@@ -2300,10 +2333,8 @@ static int nvmet_pci_epf_epc_init(struct pci_epf *epf)
if (ret)
goto out_clear_bar;
if (!epc_features->linkup_notifier) {
ctrl->link_up = true;
if (!epc_features->linkup_notifier)
nvmet_pci_epf_start_ctrl(&nvme_epf->ctrl);
}
return 0;
@@ -2319,7 +2350,6 @@ static void nvmet_pci_epf_epc_deinit(struct pci_epf *epf)
struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
ctrl->link_up = false;
nvmet_pci_epf_destroy_ctrl(ctrl);
nvmet_pci_epf_deinit_dma(nvme_epf);
@@ -2331,7 +2361,6 @@ static int nvmet_pci_epf_link_up(struct pci_epf *epf)
struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
ctrl->link_up = true;
nvmet_pci_epf_start_ctrl(ctrl);
return 0;
@@ -2342,7 +2371,6 @@ static int nvmet_pci_epf_link_down(struct pci_epf *epf)
struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
ctrl->link_up = false;
nvmet_pci_epf_stop_ctrl(ctrl);
return 0;

View File

@@ -1560,6 +1560,9 @@ static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
{
struct socket *sock = queue->sock;
if (!queue->state_change)
return;
write_lock_bh(&sock->sk->sk_callback_lock);
sock->sk->sk_data_ready = queue->data_ready;
sock->sk->sk_state_change = queue->state_change;

View File

@@ -1614,6 +1614,7 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
}
int bdev_validate_blocksize(struct block_device *bdev, int block_size);
int set_blocksize(struct file *file, int size);
int lookup_bdev(const char *pathname, dev_t *dev);
@@ -1670,10 +1671,6 @@ int bd_prepare_to_claim(struct block_device *bdev, void *holder,
const struct blk_holder_ops *hops);
void bd_abort_claiming(struct block_device *bdev, void *holder);
/* just for blk-cgroup, don't use elsewhere */
struct block_device *blkdev_get_no_open(dev_t dev);
void blkdev_put_no_open(struct block_device *bdev);
struct block_device *I_BDEV(struct inode *inode);
struct block_device *file_bdev(struct file *bdev_file);
bool disk_live(struct gendisk *disk);

View File

@@ -6,6 +6,10 @@ LDLIBS += -lpthread -lm -luring
TEST_PROGS := test_generic_01.sh
TEST_PROGS += test_generic_02.sh
TEST_PROGS += test_generic_03.sh
TEST_PROGS += test_generic_04.sh
TEST_PROGS += test_generic_05.sh
TEST_PROGS += test_generic_06.sh
TEST_PROGS += test_generic_07.sh
TEST_PROGS += test_null_01.sh
TEST_PROGS += test_null_02.sh
@@ -21,12 +25,16 @@ TEST_PROGS += test_stripe_04.sh
TEST_PROGS += test_stress_01.sh
TEST_PROGS += test_stress_02.sh
TEST_PROGS += test_stress_03.sh
TEST_PROGS += test_stress_04.sh
TEST_PROGS += test_stress_05.sh
TEST_GEN_PROGS_EXTENDED = kublk
include ../lib.mk
$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c
$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c \
fault_inject.c
check:
shellcheck -x -f gcc *.sh

View File

@@ -0,0 +1,98 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Fault injection ublk target. Hack this up however you like for
* testing specific behaviors of ublk_drv. Currently is a null target
* with a configurable delay before completing each I/O. This delay can
* be used to test ublk_drv's handling of I/O outstanding to the ublk
* server when it dies.
*/
#include "kublk.h"
static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
struct ublk_dev *dev)
{
const struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
unsigned long dev_size = 250UL << 30;
dev->tgt.dev_size = dev_size;
dev->tgt.params = (struct ublk_params) {
.types = UBLK_PARAM_TYPE_BASIC,
.basic = {
.logical_bs_shift = 9,
.physical_bs_shift = 12,
.io_opt_shift = 12,
.io_min_shift = 9,
.max_sectors = info->max_io_buf_bytes >> 9,
.dev_sectors = dev_size >> 9,
},
};
dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000);
return 0;
}
static int ublk_fault_inject_queue_io(struct ublk_queue *q, int tag)
{
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
struct io_uring_sqe *sqe;
struct __kernel_timespec ts = {
.tv_nsec = (long long)q->dev->private_data,
};
ublk_queue_alloc_sqes(q, &sqe, 1);
io_uring_prep_timeout(sqe, &ts, 1, 0);
sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, 1);
ublk_queued_tgt_io(q, tag, 1);
return 0;
}
static void ublk_fault_inject_tgt_io_done(struct ublk_queue *q, int tag,
const struct io_uring_cqe *cqe)
{
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
if (cqe->res != -ETIME)
ublk_err("%s: unexpected cqe res %d\n", __func__, cqe->res);
if (ublk_completed_tgt_io(q, tag))
ublk_complete_io(q, tag, iod->nr_sectors << 9);
else
ublk_err("%s: io not complete after 1 cqe\n", __func__);
}
static void ublk_fault_inject_cmd_line(struct dev_ctx *ctx, int argc, char *argv[])
{
static const struct option longopts[] = {
{ "delay_us", 1, NULL, 0 },
{ 0, 0, 0, 0 }
};
int option_idx, opt;
ctx->fault_inject.delay_us = 0;
while ((opt = getopt_long(argc, argv, "",
longopts, &option_idx)) != -1) {
switch (opt) {
case 0:
if (!strcmp(longopts[option_idx].name, "delay_us"))
ctx->fault_inject.delay_us = strtoll(optarg, NULL, 10);
}
}
}
static void ublk_fault_inject_usage(const struct ublk_tgt_ops *ops)
{
printf("\tfault_inject: [--delay_us us (default 0)]\n");
}
const struct ublk_tgt_ops fault_inject_tgt_ops = {
.name = "fault_inject",
.init_tgt = ublk_fault_inject_tgt_init,
.queue_io = ublk_fault_inject_queue_io,
.tgt_io_done = ublk_fault_inject_tgt_io_done,
.parse_cmd_line = ublk_fault_inject_cmd_line,
.usage = ublk_fault_inject_usage,
};

View File

@@ -5,22 +5,24 @@
#include "kublk.h"
#define MAX_NR_TGT_ARG 64
unsigned int ublk_dbg_mask = UBLK_LOG;
static const struct ublk_tgt_ops *tgt_ops_list[] = {
&null_tgt_ops,
&loop_tgt_ops,
&stripe_tgt_ops,
&fault_inject_tgt_ops,
};
static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
{
const struct ublk_tgt_ops *ops;
int i;
if (name == NULL)
return NULL;
for (i = 0; sizeof(tgt_ops_list) / sizeof(ops); i++)
for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
if (strcmp(tgt_ops_list[i]->name, name) == 0)
return tgt_ops_list[i];
return NULL;
@@ -118,6 +120,27 @@ static int ublk_ctrl_start_dev(struct ublk_dev *dev,
return __ublk_ctrl_cmd(dev, &data);
}
static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
{
struct ublk_ctrl_cmd_data data = {
.cmd_op = UBLK_U_CMD_START_USER_RECOVERY,
};
return __ublk_ctrl_cmd(dev, &data);
}
static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
{
struct ublk_ctrl_cmd_data data = {
.cmd_op = UBLK_U_CMD_END_USER_RECOVERY,
.flags = CTRL_CMD_HAS_DATA,
};
dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
return __ublk_ctrl_cmd(dev, &data);
}
static int ublk_ctrl_add_dev(struct ublk_dev *dev)
{
struct ublk_ctrl_cmd_data data = {
@@ -207,10 +230,73 @@ static const char *ublk_dev_state_desc(struct ublk_dev *dev)
};
}
static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
{
unsigned done = 0;
int i;
for (i = 0; i < CPU_SETSIZE; i++) {
if (CPU_ISSET(i, set))
done += snprintf(&buf[done], len - done, "%d ", i);
}
}
static void ublk_adjust_affinity(cpu_set_t *set)
{
int j, updated = 0;
/*
* Just keep the 1st CPU now.
*
* In future, auto affinity selection can be tried.
*/
for (j = 0; j < CPU_SETSIZE; j++) {
if (CPU_ISSET(j, set)) {
if (!updated) {
updated = 1;
continue;
}
CPU_CLR(j, set);
}
}
}
/* Caller must free the allocated buffer */
static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
{
struct ublk_ctrl_cmd_data data = {
.cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY,
.flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
};
cpu_set_t *buf;
int i, ret;
buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
if (!buf)
return -ENOMEM;
for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
data.data[0] = i;
data.len = sizeof(cpu_set_t);
data.addr = (__u64)&buf[i];
ret = __ublk_ctrl_cmd(ctrl_dev, &data);
if (ret < 0) {
free(buf);
return ret;
}
ublk_adjust_affinity(&buf[i]);
}
*ptr_buf = buf;
return 0;
}
static void ublk_ctrl_dump(struct ublk_dev *dev)
{
struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
struct ublk_params p;
cpu_set_t *affinity;
int ret;
ret = ublk_ctrl_get_params(dev, &p);
@@ -219,12 +305,31 @@ static void ublk_ctrl_dump(struct ublk_dev *dev)
return;
}
ret = ublk_ctrl_get_affinity(dev, &affinity);
if (ret < 0) {
ublk_err("failed to get affinity %m\n");
return;
}
ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
info->dev_id, info->nr_hw_queues, info->queue_depth,
1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
ublk_dev_state_desc(dev));
if (affinity) {
char buf[512];
int i;
for (i = 0; i < info->nr_hw_queues; i++) {
ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
printf("\tqueue %u: tid %d affinity(%s)\n",
i, dev->q[i].tid, buf);
}
free(affinity);
}
fflush(stdout);
}
@@ -347,7 +452,9 @@ static int ublk_queue_init(struct ublk_queue *q)
}
ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth,
IORING_SETUP_COOP_TASKRUN);
IORING_SETUP_COOP_TASKRUN |
IORING_SETUP_SINGLE_ISSUER |
IORING_SETUP_DEFER_TASKRUN);
if (ret < 0) {
ublk_err("ublk dev %d queue %d setup io_uring failed %d\n",
q->dev->dev_info.dev_id, q->q_id, ret);
@@ -429,12 +536,17 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
if (!(io->flags & UBLKSRV_IO_FREE))
return 0;
/* we issue because we need either fetching or committing */
/*
* we issue because we need either fetching or committing or
* getting data
*/
if (!(io->flags &
(UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP)))
(UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_NEED_GET_DATA)))
return 0;
if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP)
if (io->flags & UBLKSRV_NEED_GET_DATA)
cmd_op = UBLK_U_IO_NEED_GET_DATA;
else if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP)
cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
cmd_op = UBLK_U_IO_FETCH_REQ;
@@ -551,6 +663,9 @@ static void ublk_handle_cqe(struct io_uring *r,
assert(tag < q->q_depth);
if (q->tgt_ops->queue_io)
q->tgt_ops->queue_io(q, tag);
} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
io->flags |= UBLKSRV_NEED_GET_DATA | UBLKSRV_IO_FREE;
ublk_queue_io_cmd(q, io, tag);
} else {
/*
* COMMIT_REQ will be completed immediately since no fetching
@@ -602,9 +717,24 @@ static int ublk_process_io(struct ublk_queue *q)
return reapped;
}
static void ublk_queue_set_sched_affinity(const struct ublk_queue *q,
cpu_set_t *cpuset)
{
if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
ublk_err("ublk dev %u queue %u set affinity failed",
q->dev->dev_info.dev_id, q->q_id);
}
struct ublk_queue_info {
struct ublk_queue *q;
sem_t *queue_sem;
cpu_set_t *affinity;
};
static void *ublk_io_handler_fn(void *data)
{
struct ublk_queue *q = data;
struct ublk_queue_info *info = data;
struct ublk_queue *q = info->q;
int dev_id = q->dev->dev_info.dev_id;
int ret;
@@ -614,6 +744,10 @@ static void *ublk_io_handler_fn(void *data)
dev_id, q->q_id);
return NULL;
}
/* IO perf is sensitive with queue pthread affinity on NUMA machine*/
ublk_queue_set_sched_affinity(q, info->affinity);
sem_post(info->queue_sem);
ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n",
q->tid, dev_id, q->q_id);
@@ -639,7 +773,7 @@ static void ublk_set_parameters(struct ublk_dev *dev)
dev->dev_info.dev_id, ret);
}
static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id)
static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
{
uint64_t id;
int evtfd = ctx->_evtfd;
@@ -652,36 +786,68 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id)
else
id = ERROR_EVTFD_DEVID;
if (dev && ctx->shadow_dev)
memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));
if (write(evtfd, &id, sizeof(id)) != sizeof(id))
return -EINVAL;
close(evtfd);
shmdt(ctx->shadow_dev);
return 0;
}
static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
int ret, i;
void *thread_ret;
const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
struct ublk_queue_info *qinfo;
cpu_set_t *affinity_buf;
void *thread_ret;
sem_t queue_sem;
int ret, i;
ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
qinfo = (struct ublk_queue_info *)calloc(sizeof(struct ublk_queue_info),
dinfo->nr_hw_queues);
if (!qinfo)
return -ENOMEM;
sem_init(&queue_sem, 0, 0);
ret = ublk_dev_prep(ctx, dev);
if (ret)
return ret;
ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
if (ret)
return ret;
for (i = 0; i < dinfo->nr_hw_queues; i++) {
dev->q[i].dev = dev;
dev->q[i].q_id = i;
qinfo[i].q = &dev->q[i];
qinfo[i].queue_sem = &queue_sem;
qinfo[i].affinity = &affinity_buf[i];
pthread_create(&dev->q[i].thread, NULL,
ublk_io_handler_fn,
&dev->q[i]);
&qinfo[i]);
}
for (i = 0; i < dinfo->nr_hw_queues; i++)
sem_wait(&queue_sem);
free(qinfo);
free(affinity_buf);
/* everything is fine now, start us */
ublk_set_parameters(dev);
ret = ublk_ctrl_start_dev(dev, getpid());
if (ctx->recovery)
ret = ublk_ctrl_end_user_recovery(dev, getpid());
else {
ublk_set_parameters(dev);
ret = ublk_ctrl_start_dev(dev, getpid());
}
if (ret < 0) {
ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
goto fail;
@@ -691,7 +857,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
if (ctx->fg)
ublk_ctrl_dump(dev);
else
ublk_send_dev_event(ctx, dev->dev_info.dev_id);
ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
/* wait until we are terminated */
for (i = 0; i < dinfo->nr_hw_queues; i++)
@@ -856,7 +1022,10 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
}
}
ret = ublk_ctrl_add_dev(dev);
if (ctx->recovery)
ret = ublk_ctrl_start_user_recovery(dev);
else
ret = ublk_ctrl_add_dev(dev);
if (ret < 0) {
ublk_err("%s: can't add dev id %d, type %s ret %d\n",
__func__, dev_id, tgt_type, ret);
@@ -870,7 +1039,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
fail:
if (ret < 0)
ublk_send_dev_event(ctx, -1);
ublk_send_dev_event(ctx, dev, -1);
ublk_ctrl_deinit(dev);
return ret;
}
@@ -884,30 +1053,58 @@ static int cmd_dev_add(struct dev_ctx *ctx)
if (ctx->fg)
goto run;
ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
if (ctx->_shmid < 0) {
ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
exit(-1);
}
ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
if (ctx->shadow_dev == (struct ublk_dev *)-1) {
ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
exit(-1);
}
ctx->_evtfd = eventfd(0, 0);
if (ctx->_evtfd < 0) {
ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
exit(-1);
}
setsid();
res = fork();
if (res == 0) {
int res2;
setsid();
res2 = fork();
if (res2 == 0) {
/* prepare for detaching */
close(STDIN_FILENO);
close(STDOUT_FILENO);
close(STDERR_FILENO);
run:
res = __cmd_dev_add(ctx);
return res;
res = __cmd_dev_add(ctx);
return res;
} else {
/* detached from the foreground task */
exit(EXIT_SUCCESS);
}
} else if (res > 0) {
uint64_t id;
int exit_code = EXIT_FAILURE;
res = read(ctx->_evtfd, &id, sizeof(id));
close(ctx->_evtfd);
if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
ctx->dev_id = id - 1;
return __cmd_dev_list(ctx);
if (__cmd_dev_list(ctx) >= 0)
exit_code = EXIT_SUCCESS;
}
exit(EXIT_FAILURE);
shmdt(ctx->shadow_dev);
shmctl(ctx->_shmid, IPC_RMID, NULL);
/* wait for child and detach from it */
wait(NULL);
exit(exit_code);
} else {
return res;
exit(EXIT_FAILURE);
}
}
@@ -969,6 +1166,9 @@ static int __cmd_dev_list(struct dev_ctx *ctx)
ublk_err("%s: can't get dev info from %d: %d\n",
__func__, ctx->dev_id, ret);
} else {
if (ctx->shadow_dev)
memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));
ublk_ctrl_dump(dev);
}
@@ -1039,14 +1239,47 @@ static int cmd_dev_get_features(void)
return ret;
}
static void __cmd_create_help(char *exe, bool recovery)
{
int i;
printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
exe, recovery ? "recover" : "add");
printf("\t[--foreground] [--quiet] [-z] [--debug_mask mask] [-r 0|1 ] [-g]\n");
printf("\t[-e 0|1 ] [-i 0|1]\n");
printf("\t[target options] [backfile1] [backfile2] ...\n");
printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
const struct ublk_tgt_ops *ops = tgt_ops_list[i];
if (ops->usage)
ops->usage(ops);
}
}
static void cmd_add_help(char *exe)
{
__cmd_create_help(exe, false);
printf("\n");
}
static void cmd_recover_help(char *exe)
{
__cmd_create_help(exe, true);
printf("\tPlease provide exact command line for creating this device with real dev_id\n");
printf("\n");
}
static int cmd_dev_help(char *exe)
{
printf("%s add -t [null|loop] [-q nr_queues] [-d depth] [-n dev_id] [backfile1] [backfile2] ...\n", exe);
printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto allocation)\n");
cmd_add_help(exe);
cmd_recover_help(exe);
printf("%s del [-n dev_id] -a \n", exe);
printf("\t -a delete all devices -n delete specified device\n");
printf("\t -a delete all devices -n delete specified device\n\n");
printf("%s list [-n dev_id] -a \n", exe);
printf("\t -a list all devices, -n list specified device, default -a \n");
printf("\t -a list all devices, -n list specified device, default -a \n\n");
printf("%s features\n", exe);
return 0;
}
@@ -1063,9 +1296,13 @@ int main(int argc, char *argv[])
{ "quiet", 0, NULL, 0 },
{ "zero_copy", 0, NULL, 'z' },
{ "foreground", 0, NULL, 0 },
{ "chunk_size", 1, NULL, 0 },
{ "recovery", 1, NULL, 'r' },
{ "recovery_fail_io", 1, NULL, 'e'},
{ "recovery_reissue", 1, NULL, 'i'},
{ "get_data", 1, NULL, 'g'},
{ 0, 0, 0, 0 }
};
const struct ublk_tgt_ops *ops = NULL;
int option_idx, opt;
const char *cmd = argv[1];
struct dev_ctx ctx = {
@@ -1073,15 +1310,18 @@ int main(int argc, char *argv[])
.nr_hw_queues = 2,
.dev_id = -1,
.tgt_type = "unknown",
.chunk_size = 65536, /* def chunk size is 64K */
};
int ret = -EINVAL, i;
int tgt_argc = 1;
char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
int value;
if (argc == 1)
return ret;
opterr = 0;
optind = 2;
while ((opt = getopt_long(argc, argv, "t:n:d:q:az",
while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:gaz",
longopts, &option_idx)) != -1) {
switch (opt) {
case 'a':
@@ -1103,6 +1343,24 @@ int main(int argc, char *argv[])
case 'z':
ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY;
break;
case 'r':
value = strtol(optarg, NULL, 10);
if (value)
ctx.flags |= UBLK_F_USER_RECOVERY;
break;
case 'e':
value = strtol(optarg, NULL, 10);
if (value)
ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
break;
case 'i':
value = strtol(optarg, NULL, 10);
if (value)
ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
break;
case 'g':
ctx.flags |= UBLK_F_NEED_GET_DATA;
break;
case 0:
if (!strcmp(longopts[option_idx].name, "debug_mask"))
ublk_dbg_mask = strtol(optarg, NULL, 16);
@@ -1110,8 +1368,26 @@ int main(int argc, char *argv[])
ublk_dbg_mask = 0;
if (!strcmp(longopts[option_idx].name, "foreground"))
ctx.fg = 1;
if (!strcmp(longopts[option_idx].name, "chunk_size"))
ctx.chunk_size = strtol(optarg, NULL, 10);
break;
case '?':
/*
* target requires every option must have argument
*/
if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') {
fprintf(stderr, "every target option requires argument: %s %s\n",
argv[optind - 1], argv[optind]);
exit(EXIT_FAILURE);
}
if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
tgt_argv[tgt_argc++] = argv[optind - 1];
tgt_argv[tgt_argc++] = argv[optind];
} else {
fprintf(stderr, "too many target options\n");
exit(EXIT_FAILURE);
}
optind += 1;
break;
}
}
@@ -1120,9 +1396,25 @@ int main(int argc, char *argv[])
ctx.files[ctx.nr_files++] = argv[i++];
}
ops = ublk_find_tgt(ctx.tgt_type);
if (ops && ops->parse_cmd_line) {
optind = 0;
tgt_argv[0] = ctx.tgt_type;
ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
}
if (!strcmp(cmd, "add"))
ret = cmd_dev_add(&ctx);
else if (!strcmp(cmd, "del"))
else if (!strcmp(cmd, "recover")) {
if (ctx.dev_id < 0) {
fprintf(stderr, "device id isn't provided for recovering\n");
ret = -EINVAL;
} else {
ctx.recovery = 1;
ret = cmd_dev_add(&ctx);
}
} else if (!strcmp(cmd, "del"))
ret = cmd_dev_del(&ctx);
else if (!strcmp(cmd, "list")) {
ctx.all = 1;

View File

@@ -20,9 +20,15 @@
#include <sys/wait.h>
#include <sys/eventfd.h>
#include <sys/uio.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <linux/io_uring.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>
#include <semaphore.h>
/* allow ublk_dep.h to override ublk_cmd.h */
#include "ublk_dep.h"
#include <linux/ublk_cmd.h>
#define __maybe_unused __attribute__((unused))
#define MAX_BACK_FILES 4
@@ -30,6 +36,8 @@
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
/****************** part 1: libublk ********************/
#define CTRL_DEV "/dev/ublk-control"
@@ -42,8 +50,8 @@
#define UBLKSRV_IO_IDLE_SECS 20
#define UBLK_IO_MAX_BYTES (1 << 20)
#define UBLK_MAX_QUEUES 4
#define UBLK_QUEUE_DEPTH 128
#define UBLK_MAX_QUEUES 32
#define UBLK_QUEUE_DEPTH 1024
#define UBLK_DBG_DEV (1U << 0)
#define UBLK_DBG_QUEUE (1U << 1)
@@ -55,6 +63,16 @@
struct ublk_dev;
struct ublk_queue;
struct stripe_ctx {
/* stripe */
unsigned int chunk_size;
};
struct fault_inject_ctx {
/* fault_inject */
unsigned long delay_us;
};
struct dev_ctx {
char tgt_type[16];
unsigned long flags;
@@ -66,11 +84,18 @@ struct dev_ctx {
unsigned int logging:1;
unsigned int all:1;
unsigned int fg:1;
/* stripe */
unsigned int chunk_size;
unsigned int recovery:1;
int _evtfd;
int _shmid;
/* built from shmem, only for ublk_dump_dev() */
struct ublk_dev *shadow_dev;
union {
struct stripe_ctx stripe;
struct fault_inject_ctx fault_inject;
};
};
struct ublk_ctrl_cmd_data {
@@ -90,6 +115,7 @@ struct ublk_io {
#define UBLKSRV_NEED_FETCH_RQ (1UL << 0)
#define UBLKSRV_NEED_COMMIT_RQ_COMP (1UL << 1)
#define UBLKSRV_IO_FREE (1UL << 2)
#define UBLKSRV_NEED_GET_DATA (1UL << 3)
unsigned short flags;
unsigned short refs; /* used by target code only */
@@ -107,6 +133,14 @@ struct ublk_tgt_ops {
int (*queue_io)(struct ublk_queue *, int tag);
void (*tgt_io_done)(struct ublk_queue *,
int tag, const struct io_uring_cqe *);
/*
* Target specific command line handling
*
* each option requires argument for target command line
*/
void (*parse_cmd_line)(struct dev_ctx *ctx, int argc, char *argv[]);
void (*usage)(const struct ublk_tgt_ops *ops);
};
struct ublk_tgt {
@@ -357,6 +391,7 @@ static inline int ublk_queue_use_zc(const struct ublk_queue *q)
extern const struct ublk_tgt_ops null_tgt_ops;
extern const struct ublk_tgt_ops loop_tgt_ops;
extern const struct ublk_tgt_ops stripe_tgt_ops;
extern const struct ublk_tgt_ops fault_inject_tgt_ops;
void backing_file_tgt_deinit(struct ublk_dev *dev);
int backing_file_tgt_init(struct ublk_dev *dev);

View File

@@ -281,7 +281,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
.max_sectors = dev->dev_info.max_io_buf_bytes >> 9,
},
};
unsigned chunk_size = ctx->chunk_size;
unsigned chunk_size = ctx->stripe.chunk_size;
struct stripe_conf *conf;
unsigned chunk_shift;
loff_t bytes = 0;
@@ -344,10 +344,36 @@ static void ublk_stripe_tgt_deinit(struct ublk_dev *dev)
backing_file_tgt_deinit(dev);
}
static void ublk_stripe_cmd_line(struct dev_ctx *ctx, int argc, char *argv[])
{
static const struct option longopts[] = {
{ "chunk_size", 1, NULL, 0 },
{ 0, 0, 0, 0 }
};
int option_idx, opt;
ctx->stripe.chunk_size = 65536;
while ((opt = getopt_long(argc, argv, "",
longopts, &option_idx)) != -1) {
switch (opt) {
case 0:
if (!strcmp(longopts[option_idx].name, "chunk_size"))
ctx->stripe.chunk_size = strtol(optarg, NULL, 10);
}
}
}
static void ublk_stripe_usage(const struct ublk_tgt_ops *ops)
{
printf("\tstripe: [--chunk_size chunk_size (default 65536)]\n");
}
const struct ublk_tgt_ops stripe_tgt_ops = {
.name = "stripe",
.init_tgt = ublk_stripe_tgt_init,
.deinit_tgt = ublk_stripe_tgt_deinit,
.queue_io = ublk_stripe_queue_io,
.tgt_io_done = ublk_stripe_io_done,
.parse_cmd_line = ublk_stripe_cmd_line,
.usage = ublk_stripe_usage,
};

View File

@@ -17,8 +17,8 @@ _get_disk_dev_t() {
local minor
dev=/dev/ublkb"${dev_id}"
major=$(stat -c '%Hr' "$dev")
minor=$(stat -c '%Lr' "$dev")
major="0x"$(stat -c '%t' "$dev")
minor="0x"$(stat -c '%T' "$dev")
echo $(( (major & 0xfff) << 20 | (minor & 0xfffff) ))
}
@@ -30,18 +30,26 @@ _run_fio_verify_io() {
}
_create_backfile() {
local my_size=$1
local my_file
local index=$1
local new_size=$2
local old_file
local new_file
my_file=$(mktemp ublk_file_"${my_size}"_XXXXX)
truncate -s "${my_size}" "${my_file}"
echo "$my_file"
old_file="${UBLK_BACKFILES[$index]}"
[ -f "$old_file" ] && rm -f "$old_file"
new_file=$(mktemp ublk_file_"${new_size}"_XXXXX)
truncate -s "${new_size}" "${new_file}"
UBLK_BACKFILES["$index"]="$new_file"
}
_remove_backfile() {
local file=$1
_remove_files() {
local file
[ -f "$file" ] && rm -f "$file"
for file in "${UBLK_BACKFILES[@]}"; do
[ -f "$file" ] && rm -f "$file"
done
[ -f "$UBLK_TMP" ] && rm -f "$UBLK_TMP"
}
_create_tmp_dir() {
@@ -106,6 +114,7 @@ _prep_test() {
local type=$1
shift 1
modprobe ublk_drv > /dev/null 2>&1
UBLK_TMP=$(mktemp ublk_test_XXXXX)
[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*"
}
@@ -129,7 +138,10 @@ _show_result()
echo "$1 : [FAIL]"
fi
fi
[ "$2" -ne 0 ] && exit "$2"
if [ "$2" -ne 0 ]; then
_remove_files
exit "$2"
fi
return 0
}
@@ -138,16 +150,16 @@ _check_add_dev()
{
local tid=$1
local code=$2
shift 2
if [ "${code}" -ne 0 ]; then
_remove_test_files "$@"
_show_result "${tid}" "${code}"
fi
}
_cleanup_test() {
"${UBLK_PROG}" del -a
rm -f "$UBLK_TMP"
_remove_files
}
_have_feature()
@@ -158,9 +170,11 @@ _have_feature()
return 1
}
_add_ublk_dev() {
local kublk_temp;
_create_ublk_dev() {
local dev_id;
local cmd=$1
shift 1
if [ ! -c /dev/ublk-control ]; then
return ${UBLK_SKIP_CODE}
@@ -171,17 +185,34 @@ _add_ublk_dev() {
fi
fi
kublk_temp=$(mktemp /tmp/kublk-XXXXXX)
if ! "${UBLK_PROG}" add "$@" > "${kublk_temp}" 2>&1; then
if ! dev_id=$("${UBLK_PROG}" "$cmd" "$@" | grep "dev id" | awk -F '[ :]' '{print $3}'); then
echo "fail to add ublk dev $*"
rm -f "${kublk_temp}"
return 255
fi
dev_id=$(grep "dev id" "${kublk_temp}" | awk -F '[ :]' '{print $3}')
udevadm settle
rm -f "${kublk_temp}"
echo "${dev_id}"
if [[ "$dev_id" =~ ^[0-9]+$ ]]; then
echo "${dev_id}"
else
return 255
fi
}
_add_ublk_dev() {
_create_ublk_dev "add" "$@"
}
_recover_ublk_dev() {
local dev_id
local state
dev_id=$(_create_ublk_dev "recover" "$@")
for ((j=0;j<20;j++)); do
state=$(_get_ublk_dev_state "${dev_id}")
[ "$state" == "LIVE" ] && break
sleep 1
done
echo "$state"
}
# kill the ublk daemon and return ublk device state
@@ -220,7 +251,7 @@ __run_io_and_remove()
local kill_server=$3
fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
--rw=readwrite --iodepth=64 --size="${size}" --numjobs=4 \
--rw=readwrite --iodepth=256 --size="${size}" --numjobs=4 \
--runtime=20 --time_based > /dev/null 2>&1 &
sleep 2
if [ "${kill_server}" = "yes" ]; then
@@ -238,15 +269,80 @@ __run_io_and_remove()
wait
}
run_io_and_remove()
{
local size=$1
local dev_id
shift 1
dev_id=$(_add_ublk_dev "$@")
_check_add_dev "$TID" $?
[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs. remove device(ublk add $*)"
if ! __run_io_and_remove "$dev_id" "${size}" "no"; then
echo "/dev/ublkc$dev_id isn't removed"
exit 255
fi
}
run_io_and_kill_daemon()
{
local size=$1
local dev_id
shift 1
dev_id=$(_add_ublk_dev "$@")
_check_add_dev "$TID" $?
[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs kill ublk server(ublk add $*)"
if ! __run_io_and_remove "$dev_id" "${size}" "yes"; then
echo "/dev/ublkc$dev_id isn't removed res ${res}"
exit 255
fi
}
run_io_and_recover()
{
local state
local dev_id
dev_id=$(_add_ublk_dev "$@")
_check_add_dev "$TID" $?
fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
--rw=readwrite --iodepth=256 --size="${size}" --numjobs=4 \
--runtime=20 --time_based > /dev/null 2>&1 &
sleep 4
state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED")
if [ "$state" != "QUIESCED" ]; then
echo "device isn't quiesced($state) after killing daemon"
return 255
fi
state=$(_recover_ublk_dev -n "$dev_id" "$@")
if [ "$state" != "LIVE" ]; then
echo "faile to recover to LIVE($state)"
return 255
fi
if ! __remove_ublk_dev_return "${dev_id}"; then
echo "delete dev ${dev_id} failed"
return 255
fi
wait
}
_ublk_test_top_dir()
{
cd "$(dirname "$0")" && pwd
}
UBLK_TMP=$(mktemp ublk_test_XXXXX)
UBLK_PROG=$(_ublk_test_top_dir)/kublk
UBLK_TEST_QUIET=1
UBLK_TEST_SHOW_RESULT=1
UBLK_BACKFILES=()
export UBLK_PROG
export UBLK_TEST_QUIET
export UBLK_TEST_SHOW_RESULT

View File

@@ -0,0 +1,40 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="generic_04"
ERR_CODE=0
ublk_run_recover_test()
{
run_io_and_recover "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
_show_result $TID $ERR_CODE
fi
}
if ! _have_program fio; then
exit "$UBLK_SKIP_CODE"
fi
_prep_test "recover" "basic recover function verification"
_create_backfile 0 256M
_create_backfile 1 128M
_create_backfile 2 128M
ublk_run_recover_test -t null -q 2 -r 1 &
ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" &
ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
wait
ublk_run_recover_test -t null -q 2 -r 1 -i 1 &
ublk_run_recover_test -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" &
ublk_run_recover_test -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
wait
_cleanup_test "recover"
_show_result $TID $ERR_CODE

View File

@@ -0,0 +1,44 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="generic_05"
ERR_CODE=0
ublk_run_recover_test()
{
run_io_and_recover "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
_show_result $TID $ERR_CODE
fi
}
if ! _have_program fio; then
exit "$UBLK_SKIP_CODE"
fi
if ! _have_feature "ZERO_COPY"; then
exit "$UBLK_SKIP_CODE"
fi
_prep_test "recover" "basic recover function verification (zero copy)"
_create_backfile 0 256M
_create_backfile 1 128M
_create_backfile 2 128M
ublk_run_recover_test -t null -q 2 -r 1 -z &
ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" &
ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
wait
ublk_run_recover_test -t null -q 2 -r 1 -z -i 1 &
ublk_run_recover_test -t loop -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[0]}" &
ublk_run_recover_test -t stripe -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
wait
_cleanup_test "recover"
_show_result $TID $ERR_CODE

View File

@@ -0,0 +1,41 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="generic_06"
ERR_CODE=0
_prep_test "fault_inject" "fast cleanup when all I/Os of one hctx are in server"
# configure ublk server to sleep 2s before completing each I/O
dev_id=$(_add_ublk_dev -t fault_inject -q 2 -d 1 --delay_us 2000000)
_check_add_dev $TID $?
STARTTIME=${SECONDS}
dd if=/dev/urandom of=/dev/ublkb${dev_id} oflag=direct bs=4k count=1 status=none > /dev/null 2>&1 &
dd_pid=$!
__ublk_kill_daemon ${dev_id} "DEAD"
wait $dd_pid
dd_exitcode=$?
ENDTIME=${SECONDS}
ELAPSED=$(($ENDTIME - $STARTTIME))
# assert that dd sees an error and exits quickly after ublk server is
# killed. previously this relied on seeing an I/O timeout and so would
# take ~30s
if [ $dd_exitcode -eq 0 ]; then
echo "dd unexpectedly exited successfully!"
ERR_CODE=255
fi
if [ $ELAPSED -ge 5 ]; then
echo "dd took $ELAPSED seconds to exit (>= 5s tolerance)!"
ERR_CODE=255
fi
_cleanup_test "fault_inject"
_show_result $TID $ERR_CODE

View File

@@ -0,0 +1,28 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="generic_07"
ERR_CODE=0
if ! _have_program fio; then
exit "$UBLK_SKIP_CODE"
fi
_prep_test "generic" "test UBLK_F_NEED_GET_DATA"
_create_backfile 0 256M
dev_id=$(_add_ublk_dev -t loop -q 2 -g "${UBLK_BACKFILES[0]}")
_check_add_dev $TID $?
# run fio over the ublk disk
_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M
ERR_CODE=$?
if [ "$ERR_CODE" -eq 0 ]; then
_mkfs_mount_test /dev/ublkb"${dev_id}"
ERR_CODE=$?
fi
_cleanup_test "generic"
_show_result $TID $ERR_CODE

View File

@@ -12,10 +12,10 @@ fi
_prep_test "loop" "write and verify test"
backfile_0=$(_create_backfile 256M)
_create_backfile 0 256M
dev_id=$(_add_ublk_dev -t loop "$backfile_0")
_check_add_dev $TID $? "${backfile_0}"
dev_id=$(_add_ublk_dev -t loop "${UBLK_BACKFILES[0]}")
_check_add_dev $TID $?
# run fio over the ublk disk
_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M
@@ -23,6 +23,4 @@ ERR_CODE=$?
_cleanup_test "loop"
_remove_backfile "$backfile_0"
_show_result $TID $ERR_CODE

View File

@@ -8,15 +8,13 @@ ERR_CODE=0
_prep_test "loop" "mkfs & mount & umount"
backfile_0=$(_create_backfile 256M)
dev_id=$(_add_ublk_dev -t loop "$backfile_0")
_check_add_dev $TID $? "$backfile_0"
_create_backfile 0 256M
dev_id=$(_add_ublk_dev -t loop "${UBLK_BACKFILES[0]}")
_check_add_dev $TID $?
_mkfs_mount_test /dev/ublkb"${dev_id}"
ERR_CODE=$?
_cleanup_test "loop"
_remove_backfile "$backfile_0"
_show_result $TID $ERR_CODE

View File

@@ -12,9 +12,9 @@ fi
_prep_test "loop" "write and verify over zero copy"
backfile_0=$(_create_backfile 256M)
dev_id=$(_add_ublk_dev -t loop -z "$backfile_0")
_check_add_dev $TID $? "$backfile_0"
_create_backfile 0 256M
dev_id=$(_add_ublk_dev -t loop -z "${UBLK_BACKFILES[0]}")
_check_add_dev $TID $?
# run fio over the ublk disk
_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M
@@ -22,6 +22,4 @@ ERR_CODE=$?
_cleanup_test "loop"
_remove_backfile "$backfile_0"
_show_result $TID $ERR_CODE

View File

@@ -8,15 +8,14 @@ ERR_CODE=0
_prep_test "loop" "mkfs & mount & umount with zero copy"
backfile_0=$(_create_backfile 256M)
dev_id=$(_add_ublk_dev -t loop -z "$backfile_0")
_check_add_dev $TID $? "$backfile_0"
_create_backfile 0 256M
dev_id=$(_add_ublk_dev -t loop -z "${UBLK_BACKFILES[0]}")
_check_add_dev $TID $?
_mkfs_mount_test /dev/ublkb"${dev_id}"
ERR_CODE=$?
_cleanup_test "loop"
_remove_backfile "$backfile_0"
_show_result $TID $ERR_CODE

View File

@@ -12,10 +12,10 @@ fi
_prep_test "loop" "write and verify test"
backfile_0=$(_create_backfile 256M)
_create_backfile 0 256M
dev_id=$(_add_ublk_dev -q 2 -t loop "$backfile_0")
_check_add_dev $TID $? "${backfile_0}"
dev_id=$(_add_ublk_dev -q 2 -t loop "${UBLK_BACKFILES[0]}")
_check_add_dev $TID $?
# run fio over the ublk disk
_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=256M
@@ -23,6 +23,4 @@ ERR_CODE=$?
_cleanup_test "loop"
_remove_backfile "$backfile_0"
_show_result $TID $ERR_CODE

View File

@@ -4,44 +4,31 @@
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="stress_01"
ERR_CODE=0
DEV_ID=-1
ublk_io_and_remove()
{
local size=$1
shift 1
local backfile=""
if echo "$@" | grep -q "loop"; then
backfile=${*: -1}
fi
DEV_ID=$(_add_ublk_dev "$@")
_check_add_dev $TID $? "${backfile}"
[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs. remove device(ublk add $*)"
if ! __run_io_and_remove "${DEV_ID}" "${size}" "no"; then
echo "/dev/ublkc${DEV_ID} isn't removed"
_remove_backfile "${backfile}"
exit 255
run_io_and_remove "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
_show_result $TID $ERR_CODE
fi
}
if ! _have_program fio; then
exit "$UBLK_SKIP_CODE"
fi
_prep_test "stress" "run IO and remove device"
ublk_io_and_remove 8G -t null -q 4
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
_show_result $TID $ERR_CODE
fi
_create_backfile 0 256M
_create_backfile 1 128M
_create_backfile 2 128M
BACK_FILE=$(_create_backfile 256M)
ublk_io_and_remove 256M -t loop -q 4 "${BACK_FILE}"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
_show_result $TID $ERR_CODE
fi
ublk_io_and_remove 8G -t null -q 4 &
ublk_io_and_remove 256M -t loop -q 4 "${UBLK_BACKFILES[0]}" &
ublk_io_and_remove 256M -t stripe -q 4 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
wait
ublk_io_and_remove 256M -t loop -q 4 -z "${BACK_FILE}"
ERR_CODE=$?
_cleanup_test "stress"
_remove_backfile "${BACK_FILE}"
_show_result $TID $ERR_CODE

View File

@@ -4,44 +4,31 @@
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="stress_02"
ERR_CODE=0
DEV_ID=-1
if ! _have_program fio; then
exit "$UBLK_SKIP_CODE"
fi
ublk_io_and_kill_daemon()
{
local size=$1
shift 1
local backfile=""
if echo "$@" | grep -q "loop"; then
backfile=${*: -1}
fi
DEV_ID=$(_add_ublk_dev "$@")
_check_add_dev $TID $? "${backfile}"
[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs kill ublk server(ublk add $*)"
if ! __run_io_and_remove "${DEV_ID}" "${size}" "yes"; then
echo "/dev/ublkc${DEV_ID} isn't removed res ${res}"
_remove_backfile "${backfile}"
exit 255
run_io_and_kill_daemon "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
_show_result $TID $ERR_CODE
fi
}
_prep_test "stress" "run IO and kill ublk server"
ublk_io_and_kill_daemon 8G -t null -q 4
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
_show_result $TID $ERR_CODE
fi
_create_backfile 0 256M
_create_backfile 1 128M
_create_backfile 2 128M
BACK_FILE=$(_create_backfile 256M)
ublk_io_and_kill_daemon 256M -t loop -q 4 "${BACK_FILE}"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
_show_result $TID $ERR_CODE
fi
ublk_io_and_kill_daemon 8G -t null -q 4 &
ublk_io_and_kill_daemon 256M -t loop -q 4 "${UBLK_BACKFILES[0]}" &
ublk_io_and_kill_daemon 256M -t stripe -q 4 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
wait
ublk_io_and_kill_daemon 256M -t loop -q 4 -z "${BACK_FILE}"
ERR_CODE=$?
_cleanup_test "stress"
_remove_backfile "${BACK_FILE}"
_show_result $TID $ERR_CODE

View File

@@ -0,0 +1,38 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="stress_03"
ERR_CODE=0
ublk_io_and_remove()
{
run_io_and_remove "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
_show_result $TID $ERR_CODE
fi
}
if ! _have_program fio; then
exit "$UBLK_SKIP_CODE"
fi
if ! _have_feature "ZERO_COPY"; then
exit "$UBLK_SKIP_CODE"
fi
_prep_test "stress" "run IO and remove device(zero copy)"
_create_backfile 0 256M
_create_backfile 1 128M
_create_backfile 2 128M
ublk_io_and_remove 8G -t null -q 4 -z &
ublk_io_and_remove 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
ublk_io_and_remove 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
wait
_cleanup_test "stress"
_show_result $TID $ERR_CODE

View File

@@ -0,0 +1,37 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="stress_04"
ERR_CODE=0
ublk_io_and_kill_daemon()
{
run_io_and_kill_daemon "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
_show_result $TID $ERR_CODE
fi
}
if ! _have_program fio; then
exit "$UBLK_SKIP_CODE"
fi
if ! _have_feature "ZERO_COPY"; then
exit "$UBLK_SKIP_CODE"
fi
_prep_test "stress" "run IO and kill ublk server(zero copy)"
_create_backfile 0 256M
_create_backfile 1 128M
_create_backfile 2 128M
ublk_io_and_kill_daemon 8G -t null -q 4 -z &
ublk_io_and_kill_daemon 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
ublk_io_and_kill_daemon 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
wait
_cleanup_test "stress"
_show_result $TID $ERR_CODE

View File

@@ -0,0 +1,64 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="stress_05"
ERR_CODE=0
run_io_and_remove()
{
local size=$1
local dev_id
local dev_pid
shift 1
dev_id=$(_add_ublk_dev "$@")
_check_add_dev $TID $?
[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs. remove device(ublk add $*)"
fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
--rw=readwrite --iodepth=128 --size="${size}" --numjobs=4 \
--runtime=40 --time_based > /dev/null 2>&1 &
sleep 4
dev_pid=$(_get_ublk_daemon_pid "$dev_id")
kill -9 "$dev_pid"
if ! __remove_ublk_dev_return "${dev_id}"; then
echo "delete dev ${dev_id} failed"
return 255
fi
}
ublk_io_and_remove()
{
run_io_and_remove "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
_show_result $TID $ERR_CODE
fi
}
_prep_test "stress" "run IO and remove device with recovery enabled"
_create_backfile 0 256M
_create_backfile 1 256M
for reissue in $(seq 0 1); do
ublk_io_and_remove 8G -t null -q 4 -g -r 1 -i "$reissue" &
ublk_io_and_remove 256M -t loop -q 4 -g -r 1 -i "$reissue" "${UBLK_BACKFILES[0]}" &
wait
done
if _have_feature "ZERO_COPY"; then
for reissue in $(seq 0 1); do
ublk_io_and_remove 8G -t null -q 4 -g -z -r 1 -i "$reissue" &
ublk_io_and_remove 256M -t loop -q 4 -g -z -r 1 -i "$reissue" "${UBLK_BACKFILES[1]}" &
wait
done
fi
_cleanup_test "stress"
_show_result $TID $ERR_CODE

View File

@@ -12,19 +12,15 @@ fi
_prep_test "stripe" "write and verify test"
backfile_0=$(_create_backfile 256M)
backfile_1=$(_create_backfile 256M)
_create_backfile 0 256M
_create_backfile 1 256M
dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1")
_check_add_dev $TID $? "${backfile_0}"
dev_id=$(_add_ublk_dev -t stripe "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}")
_check_add_dev $TID $?
# run fio over the ublk disk
_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=512M
ERR_CODE=$?
_cleanup_test "stripe"
_remove_backfile "$backfile_0"
_remove_backfile "$backfile_1"
_show_result $TID $ERR_CODE

View File

@@ -8,17 +8,14 @@ ERR_CODE=0
_prep_test "stripe" "mkfs & mount & umount"
backfile_0=$(_create_backfile 256M)
backfile_1=$(_create_backfile 256M)
dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1")
_check_add_dev $TID $? "$backfile_0" "$backfile_1"
_create_backfile 0 256M
_create_backfile 1 256M
dev_id=$(_add_ublk_dev -t stripe "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}")
_check_add_dev $TID $?
_mkfs_mount_test /dev/ublkb"${dev_id}"
ERR_CODE=$?
_cleanup_test "stripe"
_remove_backfile "$backfile_0"
_remove_backfile "$backfile_1"
_show_result $TID $ERR_CODE

View File

@@ -12,19 +12,15 @@ fi
_prep_test "stripe" "write and verify test"
backfile_0=$(_create_backfile 256M)
backfile_1=$(_create_backfile 256M)
_create_backfile 0 256M
_create_backfile 1 256M
dev_id=$(_add_ublk_dev -q 2 -t stripe "$backfile_0" "$backfile_1")
_check_add_dev $TID $? "${backfile_0}"
dev_id=$(_add_ublk_dev -q 2 -t stripe "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}")
_check_add_dev $TID $?
# run fio over the ublk disk
_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" --size=512M
ERR_CODE=$?
_cleanup_test "stripe"
_remove_backfile "$backfile_0"
_remove_backfile "$backfile_1"
_show_result $TID $ERR_CODE

View File

@@ -8,17 +8,14 @@ ERR_CODE=0
_prep_test "stripe" "mkfs & mount & umount on zero copy"
backfile_0=$(_create_backfile 256M)
backfile_1=$(_create_backfile 256M)
dev_id=$(_add_ublk_dev -t stripe -z -q 2 "$backfile_0" "$backfile_1")
_check_add_dev $TID $? "$backfile_0" "$backfile_1"
_create_backfile 0 256M
_create_backfile 1 256M
dev_id=$(_add_ublk_dev -t stripe -z -q 2 "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}")
_check_add_dev $TID $?
_mkfs_mount_test /dev/ublkb"${dev_id}"
ERR_CODE=$?
_cleanup_test "stripe"
_remove_backfile "$backfile_0"
_remove_backfile "$backfile_1"
_show_result $TID $ERR_CODE