Merge tag 'block-7.1-20260515' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull block fixes from Jens Axboe:

 - NVMe merge request via Keith:
     - Fix memory leak on a passthrough integrity mapping failure (Keith)
     - Hide secrets behind debug option (Hannes)
     - Fix pci use-after-free for host memory buffer (Chia-Lin Kao)
     - Fix tcp target use-after-free for data digest (Sagi)
     - Revert a mistaken quirk (Alan Cui)
     - Fix uevent and controller state race condition (Maurizio)
     - Fix apple submission queue re-initialization (Nick Chan)

 - Three fixes for blk-integrity, fixing an issue with the user data
   mapping and two problems with recomputing the number of segments

 - Two fixes for the iov_iter bounce buffering

 - Fix for the handling of dead zone write plugs

 - ublk max_sectors validation fix, with associated selftest addition

* tag 'block-7.1-20260515' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  nvme-apple: Reset q->sq_tail during queue init
  block: align down bounces bios
  block: pass a minsize argument to bio_iov_iter_bounce
  selftests: ublk: cap nthreads to kernel's actual nr_hw_queues
  block: fix handling of dead zone write plugs
  block: bio-integrity: Fix null-ptr-deref in bio_integrity_map_user()
  block: recompute nr_integrity_segments in blk_insert_cloned_request
  block: don't overwrite bip_vcnt in bio_integrity_copy_user()
  nvme: fix race condition between connected uevent and STARTED_ONCE flag
  Revert "nvme: add quirk NVME_QUIRK_IGNORE_DEV_SUBNQN for 144d:a808"
  nvmet-tcp: Fix potential UAF when ddgst mismatch
  nvme-pci: fix use-after-free in nvme_free_host_mem()
  nvmet-auth: Do not print DH-HMAC-CHAP secrets
  nvme: fix bio leak on mapping failure
  nvme: make prp passthrough usage less scary
  ublk: reject max_sectors smaller than PAGE_SECTORS in parameter validation
Merged by: Linus Torvalds
Date: 2026-05-15 12:47:00 -07:00

15 changed files with 130 additions and 45 deletions

--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c

@@ -308,7 +308,6 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
         }
 
         bip->bip_flags |= BIP_COPY_USER;
-        bip->bip_vcnt = nr_vecs;
         return 0;
 free_bip:
         bio_integrity_free(bio);
@@ -403,6 +402,24 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
         if (unlikely(ret < 0))
                 goto free_bvec;
 
+        /*
+         * Handle partial pinning. This can happen when pin_user_pages_fast()
+         * returns fewer pages than requested.
+         */
+        if (user_backed_iter(iter) && unlikely(ret != bytes)) {
+                if (ret > 0) {
+                        int npinned = DIV_ROUND_UP(offset + ret, PAGE_SIZE);
+                        int i;
+
+                        for (i = 0; i < npinned; i++)
+                                unpin_user_page(pages[i]);
+                }
+                if (pages != stack_pages)
+                        kvfree(pages);
+                ret = -EFAULT;
+                goto free_bvec;
+        }
+
         nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset,
                         &is_p2p);
         if (pages != stack_pages)

--- a/block/bio.c
+++ b/block/bio.c

@@ -1279,11 +1279,12 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
         return bio_iov_iter_align_down(bio, iter, len_align_mask);
 }
 
-static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
+static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size,
+                size_t minsize)
 {
         struct folio *folio;
 
-        while (*size > PAGE_SIZE) {
+        while (*size > minsize) {
                 folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
                 if (folio)
                         return folio;
@@ -1307,7 +1308,7 @@ static void bio_free_folios(struct bio *bio)
 }
 
 static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter,
-                size_t maxlen)
+                size_t maxlen, size_t minsize)
 {
         size_t total_len = min(maxlen, iov_iter_count(iter));
 
@@ -1322,13 +1323,13 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter,
                 size_t this_len = min(total_len, SZ_1M);
                 struct folio *folio;
 
-                if (this_len > PAGE_SIZE * 2)
+                if (this_len > minsize * 2)
                         this_len = rounddown_pow_of_two(this_len);
 
                 if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len)
                         break;
 
-                folio = folio_alloc_greedy(GFP_KERNEL, &this_len);
+                folio = folio_alloc_greedy(GFP_KERNEL, &this_len, minsize);
                 if (!folio)
                         break;
 
                 bio_add_folio_nofail(bio, folio, this_len, 0);
@@ -1344,16 +1345,16 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter,
         if (!bio->bi_iter.bi_size)
                 return -ENOMEM;
-        return 0;
+        return bio_iov_iter_align_down(bio, iter, minsize - 1);
 }
 
 static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter,
-                size_t maxlen)
+                size_t maxlen, size_t minsize)
 {
         size_t len = min3(iov_iter_count(iter), maxlen, SZ_1M);
         struct folio *folio;
 
-        folio = folio_alloc_greedy(GFP_KERNEL, &len);
+        folio = folio_alloc_greedy(GFP_KERNEL, &len, minsize);
         if (!folio)
                 return -ENOMEM;
@@ -1382,7 +1383,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter,
         bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
         if (iov_iter_extract_will_pin(iter))
                 bio_set_flag(bio, BIO_PAGE_PINNED);
-        return 0;
+        return bio_iov_iter_align_down(bio, iter, minsize - 1);
 }
 
 /**
@@ -1390,6 +1391,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter,
  * @bio: bio to send
  * @iter: iter to read from / write into
  * @maxlen: maximum size to bounce
+ * @minsize: minimum folio allocation size
  *
  * Helper for direct I/O implementations that need to bounce buffer because
  * we need to checksum the data or perform other operations that require
@@ -1397,11 +1399,12 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter,
  * copies the data into it. Needs to be paired with bio_iov_iter_unbounce()
  * called on completion.
  */
-int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen)
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen,
+                size_t minsize)
 {
         if (op_is_write(bio_op(bio)))
-                return bio_iov_iter_bounce_write(bio, iter, maxlen);
-        return bio_iov_iter_bounce_read(bio, iter, maxlen);
+                return bio_iov_iter_bounce_write(bio, iter, maxlen, minsize);
+        return bio_iov_iter_bounce_read(bio, iter, maxlen, minsize);
 }
 
 static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)

--- a/block/blk-mq.c
+++ b/block/blk-mq.c

@@ -3307,6 +3307,25 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
                 return BLK_STS_IOERR;
         }
 
+        /*
+         * Integrity segment counting depends on the same queue limits
+         * (virt_boundary_mask, seg_boundary_mask, max_segment_size) that
+         * vary across stacked queues, so recompute against the bottom
+         * queue just like nr_phys_segments above.
+         */
+        if (blk_integrity_rq(rq) && rq->bio) {
+                unsigned short max_int_segs = queue_max_integrity_segments(q);
+
+                rq->nr_integrity_segments =
+                        blk_rq_count_integrity_sg(rq->q, rq->bio);
+                if (rq->nr_integrity_segments > max_int_segs) {
+                        printk(KERN_ERR "%s: over max integrity segments limit. (%u > %u)\n",
+                               __func__, rq->nr_integrity_segments,
+                               max_int_segs);
+                        return BLK_STS_IOERR;
+                }
+        }
+
         if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
                 return BLK_STS_IOERR;

--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c

@@ -623,6 +623,28 @@ static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
         }
 }
 
+static inline bool disk_check_zone_wplug_dead(struct blk_zone_wplug *zwplug)
+{
+        if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD))
+                return false;
+
+        /*
+         * If a new write is received right after a zone reset completes and
+         * while the disk_zone_wplugs_worker() thread has not yet released the
+         * reference on the zone write plug after processing the last write to
+         * the zone, then the new write BIO will see the zone write plug marked
+         * as dead. This case is however a false positive and a perfectly valid
+         * pattern. In such case, restore the zone write plug to a live one.
+         */
+        if (!zwplug->wp_offset && bio_list_empty(&zwplug->bio_list)) {
+                zwplug->flags &= ~BLK_ZONE_WPLUG_DEAD;
+                refcount_inc(&zwplug->ref);
+                return false;
+        }
+
+        return true;
+}
+
 static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
                 struct blk_zone_wplug *zwplug);
@@ -1444,12 +1466,12 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
         spin_lock_irqsave(&zwplug->lock, flags);
 
         /*
-         * If we got a zone write plug marked as dead, then the user is issuing
-         * writes to a full zone, or without synchronizing with zone reset or
-         * zone finish operations. In such case, fail the BIO to signal this
-         * invalid usage.
+         * Check if we got a zone write plug marked as dead. If yes, then the
+         * user is likely issuing writes to a full zone, or without
+         * synchronizing with zone reset or zone finish operations. In such
+         * case, fail the BIO to signal this invalid usage.
         */
-        if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) {
+        if (disk_check_zone_wplug_dead(zwplug)) {
                 spin_unlock_irqrestore(&zwplug->lock, flags);
                 disk_put_zone_wplug(zwplug);
                 bio_io_error(bio);

--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c

@@ -920,6 +920,9 @@ static int ublk_validate_params(const struct ublk_device *ub)
                 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
                         return -EINVAL;
 
+                if (p->max_sectors < PAGE_SECTORS)
+                        return -EINVAL;
+
                 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
                         return -EINVAL;
         } else

--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c

@@ -1009,6 +1009,7 @@ static void apple_nvme_init_queue(struct apple_nvme_queue *q)
         unsigned int depth = apple_nvme_queue_depth(q);
         struct apple_nvme *anv = queue_to_apple_nvme(q);
 
+        q->sq_tail = 0;
         q->cq_head = 0;
         q->cq_phase = 1;
         if (anv->hw->has_lsq_nvmmu)

--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c

@@ -3749,6 +3749,10 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
                 ret = nvme_hwmon_init(ctrl);
                 if (ret == -EINTR)
                         return ret;
+
+                if (!nvme_ctrl_sgl_supported(ctrl))
+                        dev_info(ctrl->device,
+                                 "passthrough uses implicit buffer lengths\n");
         }
 
         clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags);
@@ -5041,8 +5045,8 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
                 nvme_mpath_update(ctrl);
         }
 
-        nvme_change_uevent(ctrl, "NVME_EVENT=connected");
         set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags);
+        nvme_change_uevent(ctrl, "NVME_EVENT=connected");
 }
 EXPORT_SYMBOL_GPL(nvme_start_ctrl);

--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c

@@ -120,21 +120,11 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
         struct nvme_ns *ns = q->queuedata;
         struct block_device *bdev = ns ? ns->disk->part0 : NULL;
         bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
-        struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
         bool has_metadata = meta_buffer && meta_len;
         struct bio *bio = NULL;
         int ret;
 
-        if (!nvme_ctrl_sgl_supported(ctrl))
-                dev_warn_once(ctrl->device, "using unchecked data buffer\n");
-        if (has_metadata) {
-                if (!supports_metadata)
-                        return -EINVAL;
-
-                if (!nvme_ctrl_meta_sgl_supported(ctrl))
-                        dev_warn_once(ctrl->device,
-                                      "using unchecked metadata buffer\n");
-        }
+        if (has_metadata && !supports_metadata)
+                return -EINVAL;
 
         if (iter)
                 ret = blk_rq_map_user_iov(q, req, NULL, iter, GFP_KERNEL);
@@ -154,8 +144,8 @@
         return ret;
 
 out_unmap:
-        if (bio)
-                blk_rq_unmap_user(bio);
+        if (req->bio)
+                blk_rq_unmap_user(req->bio);
         return ret;
 }

--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c

@@ -2533,11 +2533,13 @@ static void nvme_free_host_mem_multi(struct nvme_dev *dev)
 static void nvme_free_host_mem(struct nvme_dev *dev)
 {
-        if (dev->hmb_sgt)
+        if (dev->hmb_sgt) {
                 dma_free_noncontiguous(dev->dev, dev->host_mem_size,
                                 dev->hmb_sgt, DMA_BIDIRECTIONAL);
-        else
+                dev->hmb_sgt = NULL;
+        } else {
                 nvme_free_host_mem_multi(dev);
+        }
 
         dma_free_coherent(dev->dev, dev->host_mem_descs_size,
                         dev->host_mem_descs, dev->host_mem_descs_dma);
@@ -4107,8 +4109,6 @@
                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
         { PCI_DEVICE(0x1c5f, 0x0555),   /* Memblaze Pblaze5 adapter */
                 .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
-        { PCI_DEVICE(0x144d, 0xa808),   /* Samsung PM981/983 */
-                .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
         { PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
                 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
         { PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */

--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig

@@ -117,6 +117,15 @@ config NVME_TARGET_AUTH
          If unsure, say N.
 
+config NVME_TARGET_AUTH_DEBUG
+        bool "NVMe over Fabrics In-band Authentication debug messages"
+        depends on NVME_TARGET_AUTH
+        help
+          This enables additional debug messages including the generated
+          DH-HMAC-CHAP secrets to help debugging authentication failures.
+
+          If unsure, say N.
+
 config NVME_TARGET_PCI_EPF
         tristate "NVMe PCI Endpoint Function target support"
         depends on NVME_TARGET && PCI_ENDPOINT

--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c

@@ -144,7 +144,6 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset)
                 goto out_unlock;
 
         list_for_each_entry(p, &ctrl->subsys->hosts, entry) {
-                pr_debug("check %s\n", nvmet_host_name(p->host));
                 if (strcmp(nvmet_host_name(p->host), ctrl->hostnqn))
                         continue;
                 host = p->host;
@@ -189,11 +188,12 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset)
                 ctrl->host_key = NULL;
                 goto out_free_hash;
         }
+#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG
         pr_debug("%s: using hash %s key %*ph\n", __func__,
                  ctrl->host_key->hash > 0 ?
                  nvme_auth_hmac_name(ctrl->host_key->hash) : "none",
                  (int)ctrl->host_key->len, ctrl->host_key->key);
+#endif
 
         nvme_auth_free_key(ctrl->ctrl_key);
         if (!host->dhchap_ctrl_secret) {
                 ctrl->ctrl_key = NULL;
@@ -207,11 +207,12 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset)
                 ctrl->ctrl_key = NULL;
                 goto out_free_hash;
         }
+#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG
         pr_debug("%s: using ctrl hash %s key %*ph\n", __func__,
                  ctrl->ctrl_key->hash > 0 ?
                  nvme_auth_hmac_name(ctrl->ctrl_key->hash) : "none",
                  (int)ctrl->ctrl_key->len, ctrl->ctrl_key->key);
+#endif
 
 out_free_hash:
         if (ret) {
                 if (ctrl->host_key) {
@@ -317,7 +318,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
                 if (ret)
                         goto out_free_challenge;
         }
-
         pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
                  ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
                  req->sq->dhchap_tid);
@@ -434,8 +434,10 @@ int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
                 ret = -EINVAL;
         } else {
                 memcpy(buf, ctrl->dh_key, buf_size);
+#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG
                 pr_debug("%s: ctrl %d public key %*ph\n", __func__,
                          ctrl->cntlid, (int)buf_size, buf);
+#endif
         }
 
         return ret;
@@ -458,11 +460,12 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
                              ctrl->shash_id);
         if (ret)
                 pr_debug("failed to compute session key, err %d\n", ret);
+#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG
         else
                 pr_debug("%s: session key %*ph\n", __func__,
                          (int)req->sq->dhchap_skey_len,
                          req->sq->dhchap_skey);
+#endif
 
         return ret;
 }

--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c

@@ -1321,8 +1321,10 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
                         queue->idx, cmd->req.cmd->common.command_id,
                         queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
                         le32_to_cpu(cmd->exp_ddgst));
-                if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED))
+                if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED)) {
                         cmd->req.cqe->status = NVME_SC_CMD_SEQ_ERROR;
+                        nvmet_req_uninit(&cmd->req);
+                }
                 nvmet_tcp_free_cmd_buffers(cmd);
                 ret = -EPROTO;
                 goto out;

--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c

@@ -355,7 +355,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
         if (dio->flags & IOMAP_DIO_BOUNCE)
                 ret = bio_iov_iter_bounce(bio, dio->submit.iter,
-                                iomap_max_bio_size(&iter->iomap));
+                                iomap_max_bio_size(&iter->iomap), alignment);
         else
                 ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
                                 alignment - 1);

--- a/include/linux/bio.h
+++ b/include/linux/bio.h

@@ -475,7 +475,8 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
-int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen);
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen,
+                size_t minsize);
 void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty);
 
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,

--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c

@@ -1735,6 +1735,17 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
                 goto fail;
         }
 
+        /*
+         * The kernel may reduce nr_hw_queues (e.g. capped to nr_cpu_ids).
+         * Cap nthreads to the actual queue count to avoid creating extra
+         * handler threads that will hang during device removal.
+         *
+         * per_io_tasks mode is excluded: threads interleave across all
+         * queues so nthreads > nr_hw_queues is valid and intentional.
+         */
+        if (!ctx->per_io_tasks && dev->nthreads > info->nr_hw_queues)
+                dev->nthreads = info->nr_hw_queues;
+
         ret = ublk_start_daemon(ctx, dev);
         ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
         if (ret < 0)