From 4606467a75cfc16721937272ed29462a750b60c8 Mon Sep 17 00:00:00 2001 From: Shivam Kumar Date: Wed, 18 Mar 2026 18:56:58 -0400 Subject: [PATCH 01/22] nvmet-tcp: check INIT_FAILED before nvmet_req_uninit in digest error path In nvmet_tcp_try_recv_ddgst(), when a data digest mismatch is detected, nvmet_req_uninit() is called unconditionally. However, if the command arrived via the nvmet_tcp_handle_req_failure() path, nvmet_req_init() had returned false and percpu_ref_tryget_live() was never executed. The unconditional percpu_ref_put() inside nvmet_req_uninit() then causes a refcount underflow, leading to a WARNING in percpu_ref_switch_to_atomic_rcu, a use-after-free diagnostic, and eventually a permanent workqueue deadlock. Check cmd->flags & NVMET_TCP_F_INIT_FAILED before calling nvmet_req_uninit(), matching the existing pattern in nvmet_tcp_execute_request(). Reviewed-by: Christoph Hellwig Signed-off-by: Shivam Kumar Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4b8b02341ddc..69e971b179ae 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1310,7 +1310,8 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) queue->idx, cmd->req.cmd->common.command_id, queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), le32_to_cpu(cmd->exp_ddgst)); - nvmet_req_uninit(&cmd->req); + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED)) + nvmet_req_uninit(&cmd->req); nvmet_tcp_free_cmd_buffers(cmd); nvmet_tcp_fatal_error(queue); ret = -EPROTO; From 723277b15ed97185ce6f75abbf19f06e00f0a6f5 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 31 Mar 2026 16:17:31 +0800 Subject: [PATCH 02/22] nvme: add missing MODULE_ALIAS for fabrics transports The generic fabrics layer uses request_module("nvme-%s", opts->transport) to auto-load transport modules. Currently, the nvme-tcp, nvme-rdma, and nvme-fc modules lack MODULE_ALIAS entries for these names, which prevents the kernel from automatically finding and loading them when requested. 
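For illustration, the two sides of the auto-load handshake look roughly like this (a sketch only; the request_module() line is quoted from the description above, the rest is paraphrased rather than taken from the tree):

	/* generic fabrics connect path: ask kmod to load the transport driver */
	request_module("nvme-%s", opts->transport);	/* "nvme-tcp", "nvme-rdma" or "nvme-fc" */

	/* in each transport module, the alias that lets that request resolve */
	MODULE_ALIAS("nvme-tcp");

Without the MODULE_ALIAS() declaration the request only succeeds if the module is already resolvable under that exact name.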
Reviewed-by: Christoph Hellwig Signed-off-by: Geliang Tang Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 1 + drivers/nvme/host/rdma.c | 1 + drivers/nvme/host/tcp.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e1bb4707183c..e4f4528fe2a2 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3968,3 +3968,4 @@ module_exit(nvme_fc_exit_module); MODULE_DESCRIPTION("NVMe host FC transport driver"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvme-fc"); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 57111139e84f..1ec6e867aedb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2432,3 +2432,4 @@ module_exit(nvme_rdma_cleanup_module); MODULE_DESCRIPTION("NVMe host RDMA transport driver"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvme-rdma"); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 243dab830dc8..02c95c32b07e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -3071,3 +3071,4 @@ module_exit(nvme_tcp_cleanup_module); MODULE_DESCRIPTION("NVMe host TCP transport driver"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvme-tcp"); From 0a5a94648627a438067fc8a6dd178187ceb112fb Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Wed, 8 Apr 2026 09:02:26 +0000 Subject: [PATCH 03/22] nvmet: introduce new mdts configuration entry Using this port configuration, one will be able to set the Maximum Data Transfer Size (MDTS) for any controller that will be associated to the configured port. The default value remains 0 (no limit). Signed-off-by: Max Gurtovoy Signed-off-by: Aurelien Aptel Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 8 ++------ drivers/nvme/target/configfs.c | 27 +++++++++++++++++++++++++++ drivers/nvme/target/core.c | 8 ++++++++ drivers/nvme/target/nvmet.h | 13 +++++++++++++ drivers/nvme/target/zns.c | 6 +----- 5 files changed, 51 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 3794ef258556..3842087209da 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -687,12 +687,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL | NVME_CTRL_CMIC_ANA; - /* Limit MDTS according to transport capability */ - if (ctrl->ops->get_mdts) - id->mdts = ctrl->ops->get_mdts(ctrl); - else - id->mdts = 0; - + /* Limit MDTS according to port config or transport capability */ + id->mdts = nvmet_ctrl_mdts(req); id->cntlid = cpu_to_le16(ctrl->cntlid); id->ver = cpu_to_le32(ctrl->subsys->ver); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 463348c7f097..b88f897f06e2 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -301,6 +301,31 @@ static ssize_t nvmet_param_max_queue_size_store(struct config_item *item, CONFIGFS_ATTR(nvmet_, param_max_queue_size); +static ssize_t nvmet_param_mdts_show(struct config_item *item, char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", port->mdts); +} + +static ssize_t nvmet_param_mdts_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int ret; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + ret = kstrtoint(page, 0, &port->mdts); + if (ret) { + pr_err("Invalid value '%s' for mdts\n", page); + 
return -EINVAL; + } + return count; +} + +CONFIGFS_ATTR(nvmet_, param_mdts); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_param_pi_enable_show(struct config_item *item, char *page) @@ -1995,6 +2020,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = { &nvmet_attr_addr_tsas, &nvmet_attr_param_inline_data_size, &nvmet_attr_param_max_queue_size, + &nvmet_attr_param_mdts, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_attr_param_pi_enable, #endif @@ -2053,6 +2079,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, INIT_LIST_HEAD(&port->referrals); port->inline_data_size = -1; /* < 0 == let the transport choose */ port->max_queue_size = -1; /* < 0 == let the transport choose */ + port->mdts = -1; /* < 0 == let the transport choose */ port->disc_addr.trtype = NVMF_TRTYPE_MAX; port->disc_addr.portid = cpu_to_le16(portid); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 03cc7d5f9683..33db6c5534e2 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -368,6 +368,14 @@ int nvmet_enable_port(struct nvmet_port *port) NVMET_MIN_QUEUE_SIZE, NVMET_MAX_QUEUE_SIZE); + /* + * If the transport didn't set the mdts properly, then clamp it to the + * target limits. Also set default values in case the transport didn't + * set it at all. + */ + if (port->mdts < 0 || port->mdts > NVMET_MAX_MDTS) + port->mdts = 0; + port->enabled = true; port->tr_ops = ops; return 0; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 5db8f0d6e3f2..64af6bf569bf 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -214,6 +214,7 @@ struct nvmet_port { bool enabled; int inline_data_size; int max_queue_size; + int mdts; const struct nvmet_fabrics_ops *tr_ops; bool pi_enable; }; @@ -672,6 +673,7 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, #define NVMET_MAX_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD(ctrl) (NVME_CAP_MQES(ctrl->cap) + 1) +#define NVMET_MAX_MDTS 255 /* * Nice round number that makes a list of nsids fit into a page. 
@@ -760,6 +762,17 @@ static inline bool nvmet_is_pci_ctrl(struct nvmet_ctrl *ctrl) return ctrl->port->disc_addr.trtype == NVMF_TRTYPE_PCI; } +/* Limit MDTS according to port config or transport capability */ +static inline u8 nvmet_ctrl_mdts(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u8 mdts = req->port->mdts; + + if (!ctrl->ops->get_mdts) + return mdts; + return min_not_zero(ctrl->ops->get_mdts(ctrl), mdts); +} + #ifdef CONFIG_NVME_TARGET_PASSTHRU void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys); int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index aeaf73b54c3a..f00921931eb6 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -69,7 +69,6 @@ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) void nvmet_execute_identify_ctrl_zns(struct nvmet_req *req) { u8 zasl = req->sq->ctrl->subsys->zasl; - struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_id_ctrl_zns *id; u16 status; @@ -79,10 +78,7 @@ void nvmet_execute_identify_ctrl_zns(struct nvmet_req *req) goto out; } - if (ctrl->ops->get_mdts) - id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl); - else - id->zasl = zasl; + id->zasl = min_not_zero(nvmet_ctrl_mdts(req), zasl); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); From 23528aa3320a74b028e990b5a939fed32a8afc2f Mon Sep 17 00:00:00 2001 From: Shivaji Kant Date: Mon, 6 Apr 2026 09:21:32 +0000 Subject: [PATCH 04/22] nvme: enable PCI P2PDMA support for RDMA transport Enable BLK_FEAT_PCI_P2PDMA on the NVMe when the underlying RDMA controller supports it. Suggested-by: Pranjal Shrivastava Reviewed-by: Pranjal Shrivastava Reviewed-by: Henrique Carvalho Reviewed-by: Christoph Hellwig Signed-off-by: Shivaji Kant Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 1ec6e867aedb..f77c960f7632 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2189,6 +2189,13 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_rdma_reconnect_or_remove(ctrl, ret); } +static bool nvme_rdma_supports_pci_p2pdma(struct nvme_ctrl *ctrl) +{ + struct nvme_rdma_ctrl *r_ctrl = to_rdma_ctrl(ctrl); + + return ib_dma_pci_p2p_dma_supported(r_ctrl->device->dev); +} + static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .name = "rdma", .module = THIS_MODULE, @@ -2203,6 +2210,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .get_address = nvmf_get_address, .stop_ctrl = nvme_rdma_stop_ctrl, .get_virt_boundary = nvme_get_virt_boundary, + .supports_pci_p2pdma = nvme_rdma_supports_pci_p2pdma, }; /* From ea8e356acb165cb1fd75537a52e1f66e5e76c538 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 16 Mar 2026 15:39:35 +0100 Subject: [PATCH 05/22] nvmet-tcp: propagate nvmet_tcp_build_pdu_iovec() errors to its callers Currently, when nvmet_tcp_build_pdu_iovec() detects an out-of-bounds PDU length or offset, it triggers nvmet_tcp_fatal_error(cmd->queue) and returns early. However, because the function returns void, the callers are entirely unaware that a fatal error has occurred and that the cmd->recv_msg.msg_iter was left uninitialized. Callers such as nvmet_tcp_handle_h2c_data_pdu() proceed to blindly overwrite the queue state with queue->rcv_state = NVMET_TCP_RECV_DATA Consequently, the socket receiving loop may attempt to read incoming network data into the uninitialized iterator. 
Fix this by shifting the error handling responsibility to the callers. Fixes: 52a0a9854934 ("nvmet-tcp: add bounds checks in nvmet_tcp_build_pdu_iovec") Reviewed-by: Hannes Reinecke Reviewed-by: Yunje Shin Reviewed-by: Chaitanya Kulkarni Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 51 ++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 69e971b179ae..3ade734c8bcf 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -351,7 +351,7 @@ static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue); -static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) +static int nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) { struct bio_vec *iov = cmd->iov; struct scatterlist *sg; @@ -364,22 +364,19 @@ static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) offset = cmd->rbytes_done; cmd->sg_idx = offset / PAGE_SIZE; sg_offset = offset % PAGE_SIZE; - if (!cmd->req.sg_cnt || cmd->sg_idx >= cmd->req.sg_cnt) { - nvmet_tcp_fatal_error(cmd->queue); - return; - } + if (!cmd->req.sg_cnt || cmd->sg_idx >= cmd->req.sg_cnt) + return -EPROTO; + sg = &cmd->req.sg[cmd->sg_idx]; sg_remaining = cmd->req.sg_cnt - cmd->sg_idx; while (length) { - if (!sg_remaining) { - nvmet_tcp_fatal_error(cmd->queue); - return; - } - if (!sg->length || sg->length <= sg_offset) { - nvmet_tcp_fatal_error(cmd->queue); - return; - } + if (!sg_remaining) + return -EPROTO; + + if (!sg->length || sg->length <= sg_offset) + return -EPROTO; + u32 iov_len = min_t(u32, length, sg->length - sg_offset); bvec_set_page(iov, sg_page(sg), iov_len, @@ -394,6 +391,7 @@ static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) iov_iter_bvec(&cmd->recv_msg.msg_iter, ITER_DEST, cmd->iov, nr_pages, cmd->pdu_len); + return 0; } static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) @@ -931,7 +929,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) return 0; } -static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, +static int nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) { size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); @@ -947,19 +945,23 @@ static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, if (!nvme_is_write(cmd->req.cmd) || !data_len || data_len > cmd->req.port->inline_data_size) { nvmet_prepare_receive_pdu(queue); - return; + return 0; } ret = nvmet_tcp_map_data(cmd); if (unlikely(ret)) { pr_err("queue %d: failed to map data\n", queue->idx); nvmet_tcp_fatal_error(queue); - return; + return -EPROTO; } queue->rcv_state = NVMET_TCP_RECV_DATA; - nvmet_tcp_build_pdu_iovec(cmd); cmd->flags |= NVMET_TCP_F_INIT_FAILED; + ret = nvmet_tcp_build_pdu_iovec(cmd); + if (unlikely(ret)) + pr_err("queue %d: failed to build PDU iovec\n", queue->idx); + + return ret; } static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) @@ -1011,7 +1013,10 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) goto err_proto; } cmd->pdu_recv = 0; - nvmet_tcp_build_pdu_iovec(cmd); + if (unlikely(nvmet_tcp_build_pdu_iovec(cmd))) { + pr_err("queue %d: failed to build PDU iovec\n", queue->idx); + goto err_proto; + } queue->cmd = cmd; queue->rcv_state = NVMET_TCP_RECV_DATA; @@ -1074,8 +1079,7 @@ static int nvmet_tcp_done_recv_pdu(struct 
nvmet_tcp_queue *queue) le32_to_cpu(req->cmd->common.dptr.sgl.length), le16_to_cpu(req->cqe->status)); - nvmet_tcp_handle_req_failure(queue, queue->cmd, req); - return 0; + return nvmet_tcp_handle_req_failure(queue, queue->cmd, req); } ret = nvmet_tcp_map_data(queue->cmd); @@ -1092,8 +1096,11 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) if (nvmet_tcp_need_data_in(queue->cmd)) { if (nvmet_tcp_has_inline_data(queue->cmd)) { queue->rcv_state = NVMET_TCP_RECV_DATA; - nvmet_tcp_build_pdu_iovec(queue->cmd); - return 0; + ret = nvmet_tcp_build_pdu_iovec(queue->cmd); + if (unlikely(ret)) + pr_err("queue %d: failed to build PDU iovec\n", + queue->idx); + return ret; } /* send back R2T */ nvmet_tcp_queue_response(&queue->cmd->req); From bad44c9c312f07b590ad7be892a95693baba976e Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 16 Mar 2026 15:39:36 +0100 Subject: [PATCH 06/22] nvmet-tcp: remove redundant calls to nvmet_tcp_fatal_error() Executing nvmet_tcp_fatal_error() is generally the responsibility of the caller (nvmet_tcp_try_recv); all other functions should just return the error code. Remove the nvmet_tcp_fatal_error() function, it's not needed anymore. Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 3ade734c8bcf..a4c3c62e33f5 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -349,8 +349,6 @@ static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) cmd->req.sg = NULL; } -static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue); - static int nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) { struct bio_vec *iov = cmd->iov; @@ -394,22 +392,13 @@ static int nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) return 0; } -static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) -{ - queue->rcv_state = NVMET_TCP_RECV_ERR; - if (queue->nvme_sq.ctrl) - nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); - else - kernel_sock_shutdown(queue->sock, SHUT_RDWR); -} - static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) { queue->rcv_state = NVMET_TCP_RECV_ERR; - if (status == -EPIPE || status == -ECONNRESET) + if (status == -EPIPE || status == -ECONNRESET || !queue->nvme_sq.ctrl) kernel_sock_shutdown(queue->sock, SHUT_RDWR); else - nvmet_tcp_fatal_error(queue); + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); } static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) @@ -885,7 +874,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) { pr_err("bad nvme-tcp pdu length (%d)\n", le32_to_cpu(icreq->hdr.plen)); - nvmet_tcp_fatal_error(queue); return -EPROTO; } @@ -951,7 +939,6 @@ static int nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, ret = nvmet_tcp_map_data(cmd); if (unlikely(ret)) { pr_err("queue %d: failed to map data\n", queue->idx); - nvmet_tcp_fatal_error(queue); return -EPROTO; } @@ -1024,7 +1011,6 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) err_proto: /* FIXME: use proper transport errors */ - nvmet_tcp_fatal_error(queue); return -EPROTO; } @@ -1039,7 +1025,6 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) if (hdr->type != nvme_tcp_icreq) { pr_err("unexpected pdu type (%d) before 
icreq\n", hdr->type); - nvmet_tcp_fatal_error(queue); return -EPROTO; } return nvmet_tcp_handle_icreq(queue); @@ -1048,7 +1033,6 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) if (unlikely(hdr->type == nvme_tcp_icreq)) { pr_err("queue %d: received icreq pdu in state %d\n", queue->idx, queue->state); - nvmet_tcp_fatal_error(queue); return -EPROTO; } @@ -1065,7 +1049,6 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d", queue->idx, queue->nr_cmds, queue->send_list_len, nvme_cmd->common.opcode); - nvmet_tcp_fatal_error(queue); return -ENOMEM; } @@ -1086,9 +1069,9 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) if (unlikely(ret)) { pr_err("queue %d: failed to map data\n", queue->idx); if (nvmet_tcp_has_inline_data(queue->cmd)) - nvmet_tcp_fatal_error(queue); - else - nvmet_req_complete(req, ret); + return -EPROTO; + + nvmet_req_complete(req, ret); ret = -EAGAIN; goto out; } @@ -1211,7 +1194,6 @@ static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) { pr_err("unexpected pdu type %d\n", hdr->type); - nvmet_tcp_fatal_error(queue); return -EIO; } @@ -1225,16 +1207,12 @@ static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) } if (queue->hdr_digest && - nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) { - nvmet_tcp_fatal_error(queue); /* fatal */ + nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) return -EPROTO; - } if (queue->data_digest && - nvmet_tcp_check_ddgst(queue, &queue->pdu)) { - nvmet_tcp_fatal_error(queue); /* fatal */ + nvmet_tcp_check_ddgst(queue, &queue->pdu)) return -EPROTO; - } return nvmet_tcp_done_recv_pdu(queue); } @@ -1320,7 +1298,6 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED)) nvmet_req_uninit(&cmd->req); nvmet_tcp_free_cmd_buffers(cmd); - nvmet_tcp_fatal_error(queue); ret = -EPROTO; goto out; } From 5293a8882c549fab4a878bc76b0b6c951f980a61 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 8 Apr 2026 00:51:31 -0700 Subject: [PATCH 07/22] nvmet-tcp: fix race between ICReq handling and queue teardown nvmet_tcp_handle_icreq() updates queue->state after sending an Initialization Connection Response (ICResp), but it does so without serializing against target-side queue teardown. If an NVMe/TCP host sends an Initialization Connection Request (ICReq) and immediately closes the connection, target-side teardown may start in softirq context before io_work drains the already buffered ICReq. In that case, nvmet_tcp_schedule_release_queue() sets queue->state to NVMET_TCP_Q_DISCONNECTING and drops the queue reference under state_lock. If io_work later processes that ICReq, nvmet_tcp_handle_icreq() can still overwrite the state back to NVMET_TCP_Q_LIVE. That defeats the DISCONNECTING-state guard in nvmet_tcp_schedule_release_queue() and allows a later socket state change to re-enter teardown and issue a second kref_put() on an already released queue. The ICResp send failure path has the same problem. If teardown has already moved the queue to DISCONNECTING, a send error can still overwrite the state with NVMET_TCP_Q_FAILED, again reopening the window for a second teardown path to drop the queue reference. Fix this by serializing both post-send state transitions with state_lock and bailing out if teardown has already started. 
Use -ESHUTDOWN as an internal sentinel for that bail-out path rather than propagating it as a transport error like -ECONNRESET. Keep nvmet_tcp_socket_error() setting rcv_state to NVMET_TCP_RECV_ERR before honoring that sentinel so receive-side parsing stays quiesced until the existing release path completes. Fixes: c46a6465bac2 ("nvmet-tcp: add NVMe over TCP target driver") Cc: stable@vger.kernel.org Reported-by: Shivam Kumar Tested-by: Shivam Kumar Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index a4c3c62e33f5..164a564ba3b4 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -394,6 +394,19 @@ static int nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) { + /* + * Keep rcv_state at RECV_ERR even for the internal -ESHUTDOWN path. + * nvmet_tcp_handle_icreq() can return -ESHUTDOWN after the ICReq has + * already been consumed and queue teardown has started. + * + * If nvmet_tcp_data_ready() or nvmet_tcp_write_space() queues + * nvmet_tcp_io_work() again before nvmet_tcp_release_queue_work() + * cancels it, the queue must not keep that old receive state. + * Otherwise the next nvmet_tcp_io_work() run can reach + * nvmet_tcp_done_recv_pdu() and try to handle the same ICReq again. + * + * That is why queue->rcv_state needs to be updated before we return. + */ queue->rcv_state = NVMET_TCP_RECV_ERR; if (status == -EPIPE || status == -ECONNRESET || !queue->nvme_sq.ctrl) kernel_sock_shutdown(queue->sock, SHUT_RDWR); @@ -908,11 +921,24 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) iov.iov_len = sizeof(*icresp); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (ret < 0) { + spin_lock_bh(&queue->state_lock); + if (queue->state == NVMET_TCP_Q_DISCONNECTING) { + spin_unlock_bh(&queue->state_lock); + return -ESHUTDOWN; + } queue->state = NVMET_TCP_Q_FAILED; + spin_unlock_bh(&queue->state_lock); return ret; /* queue removal will cleanup */ } + spin_lock_bh(&queue->state_lock); + if (queue->state == NVMET_TCP_Q_DISCONNECTING) { + spin_unlock_bh(&queue->state_lock); + /* Tell nvmet_tcp_socket_error() teardown is in progress. */ + return -ESHUTDOWN; + } queue->state = NVMET_TCP_Q_LIVE; + spin_unlock_bh(&queue->state_lock); nvmet_prepare_receive_pdu(queue); return 0; } From 7f991e3f9b8f044640bcb5fa8570350a68932843 Mon Sep 17 00:00:00 2001 From: Alan Cui Date: Thu, 9 Apr 2026 16:15:25 +0800 Subject: [PATCH 08/22] nvme: add quirk NVME_QUIRK_IGNORE_DEV_SUBNQN for 144d:a808 (Samsung PM981/983/970 EVO Plus ) The firmware for Samsung 970 Evo Plus / PM981 / PM983 does not support SUBNQN. Make quirks to suppress warnings. 
# nvme id-ctrl /dev/nvme1n1 NVME Identify Controller: vid : 0x144d ssvid : 0x144d sn : *** mn : Samsung SSD 970 EVO Plus 500GB fr : 2B2QEXM7 mcdqpc : 0 subnqn : ioccsz : 0 Signed-off-by: Alan Cui Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9aa19255b041..c693308c6dd6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4102,6 +4102,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa808), /* Samsung PM981/983 */ + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ From 7d435caacd91d23ebba281c4aac859196e1e2938 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 8 Apr 2026 08:03:57 +0000 Subject: [PATCH 09/22] nvme-multipath: drop head pointer check in nvme_mpath_clear_current_path() A NS will always have a head pointer, so drop the check. As proof in practice, all the nvme_mpath_clear_current_path() callers also dereference ns->head. This check has endured since the original changes to support multipath. Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index ba00f0b72b85..263161cb8ac0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -231,16 +231,12 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns) bool changed = false; int node; - if (!head) - goto out; - for_each_node(node) { if (ns == rcu_access_pointer(head->current_path[node])) { rcu_assign_pointer(head->current_path[node], NULL); changed = true; } } -out: return changed; } From aade8abd8b868b6ffa9697aadaea28ec7f65bee6 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 8 Apr 2026 17:56:47 -0700 Subject: [PATCH 10/22] nvmet: avoid recursive nvmet-wq flush in nvmet_ctrl_free nvmet_tcp_release_queue_work() runs on nvmet-wq and can drop the final controller reference through nvmet_cq_put(). If that triggers nvmet_ctrl_free(), the teardown path flushes ctrl->async_event_work on the same nvmet-wq. Call chain: nvmet_tcp_schedule_release_queue() kref_put(&queue->kref, nvmet_tcp_release_queue) nvmet_tcp_release_queue() queue_work(nvmet_wq, &queue->release_work) <--- nvmet_wq process_one_work() nvmet_tcp_release_queue_work() nvmet_cq_put(&queue->nvme_cq) nvmet_cq_destroy() nvmet_ctrl_put(cq->ctrl) nvmet_ctrl_free() flush_work(&ctrl->async_event_work) <--- nvmet_wq Previously Scheduled by :- nvmet_add_async_event queue_work(nvmet_wq, &ctrl->async_event_work); This trips lockdep with a possible recursive locking warning. [ 5223.015876] run blktests nvme/003 at 2026-04-07 20:53:55 [ 5223.061801] loop0: detected capacity change from 0 to 2097152 [ 5223.072206] nvmet: adding nsid 1 to subsystem blktests-subsystem-1 [ 5223.088368] nvmet_tcp: enabling port 0 (127.0.0.1:4420) [ 5223.126086] nvmet: Created discovery controller 1 for subsystem nqn.2014-08.org.nvmexpress.discovery for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349. 
[ 5223.128453] nvme nvme1: new ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery", addr 127.0.0.1:4420, hostnqn: nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349 [ 5233.199447] nvme nvme1: Removing ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery" [ 5233.227718] ============================================ [ 5233.231283] WARNING: possible recursive locking detected [ 5233.234696] 7.0.0-rc3nvme+ #20 Tainted: G O N [ 5233.238434] -------------------------------------------- [ 5233.241852] kworker/u192:6/2413 is trying to acquire lock: [ 5233.245429] ffff888111632548 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: touch_wq_lockdep_map+0x26/0x90 [ 5233.251438] but task is already holding lock: [ 5233.255254] ffff888111632548 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: process_one_work+0x5cc/0x6e0 [ 5233.261125] other info that might help us debug this: [ 5233.265333] Possible unsafe locking scenario: [ 5233.269217] CPU0 [ 5233.270795] ---- [ 5233.272436] lock((wq_completion)nvmet-wq); [ 5233.275241] lock((wq_completion)nvmet-wq); [ 5233.278020] *** DEADLOCK *** [ 5233.281793] May be due to missing lock nesting notation [ 5233.286195] 3 locks held by kworker/u192:6/2413: [ 5233.289192] #0: ffff888111632548 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: process_one_work+0x5cc/0x6e0 [ 5233.294569] #1: ffffc9000e2a7e40 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x1c5/0x6e0 [ 5233.300128] #2: ffffffff82d7dc40 (rcu_read_lock){....}-{1:3}, at: __flush_work+0x62/0x530 [ 5233.304290] stack backtrace: [ 5233.306520] CPU: 4 UID: 0 PID: 2413 Comm: kworker/u192:6 Tainted: G O N 7.0.0-rc3nvme+ #20 PREEMPT(full) [ 5233.306524] Tainted: [O]=OOT_MODULE, [N]=TEST [ 5233.306525] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.17.0-0-gb52ca86e094d-prebuilt.qemu.org 04/01/2014 [ 5233.306527] Workqueue: nvmet-wq nvmet_tcp_release_queue_work [nvmet_tcp] [ 5233.306532] Call Trace: [ 5233.306534] [ 5233.306536] dump_stack_lvl+0x73/0xb0 [ 5233.306552] print_deadlock_bug+0x225/0x2f0 [ 5233.306556] __lock_acquire+0x13f0/0x2290 [ 5233.306563] lock_acquire+0xd0/0x300 [ 5233.306565] ? touch_wq_lockdep_map+0x26/0x90 [ 5233.306571] ? __flush_work+0x20b/0x530 [ 5233.306573] ? touch_wq_lockdep_map+0x26/0x90 [ 5233.306577] touch_wq_lockdep_map+0x3b/0x90 [ 5233.306580] ? touch_wq_lockdep_map+0x26/0x90 [ 5233.306583] ? __flush_work+0x20b/0x530 [ 5233.306585] __flush_work+0x268/0x530 [ 5233.306588] ? __pfx_wq_barrier_func+0x10/0x10 [ 5233.306594] ? xen_error_entry+0x30/0x60 [ 5233.306600] nvmet_ctrl_free+0x140/0x310 [nvmet] [ 5233.306617] nvmet_cq_put+0x74/0x90 [nvmet] [ 5233.306629] nvmet_tcp_release_queue_work+0x19f/0x360 [nvmet_tcp] [ 5233.306634] process_one_work+0x206/0x6e0 [ 5233.306640] worker_thread+0x184/0x320 [ 5233.306643] ? __pfx_worker_thread+0x10/0x10 [ 5233.306646] kthread+0xf1/0x130 [ 5233.306648] ? __pfx_kthread+0x10/0x10 [ 5233.306651] ret_from_fork+0x355/0x450 [ 5233.306653] ? __pfx_kthread+0x10/0x10 [ 5233.306656] ret_from_fork_asm+0x1a/0x30 [ 5233.306664] There is also no need to flush async_event_work from controller teardown. The admin queue teardown already fails outstanding AER requests before the final controller put :- nvmet_sq_destroy(admin sq) nvmet_async_events_failall(ctrl) The controller has already been removed from the subsystem list before nvmet_ctrl_free() quiesces outstanding work. 
Replace flush_work() with cancel_work_sync() so a pending async_event_work item is canceled and a running instance is waited on without recursing into the same workqueue. Fixes: 06406d81a2d7 ("nvmet: cancel fatal error and flush async work before free controller") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 33db6c5534e2..a87567f40c91 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1749,7 +1749,7 @@ static void nvmet_ctrl_free(struct kref *ref) nvmet_stop_keep_alive_timer(ctrl); - flush_work(&ctrl->async_event_work); + cancel_work_sync(&ctrl->async_event_work); cancel_work_sync(&ctrl->fatal_err_work); nvmet_destroy_auth(ctrl); From e80e39f25567310c1c7392eed886890b5c6788ba Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Wed, 8 Apr 2026 14:45:22 +0200 Subject: [PATCH 11/22] nvme-core: fix parameter name in comment In the declaration of the structure "core_quirks[]", in the comment referred to the devices "Kioxia CD6-V Series / HPE PE8030", the parameter "default_ps_max_latency_us" is reported in a wrong way: nvme_core.default_ps_max_latency=0 The correct form is, instead: nvme_core.default_ps_max_latency_us=0 Reviewed-by: Christoph Hellwig Signed-off-by: Flavio Suligoi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b42d8768d297..0d623476e36f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3044,7 +3044,7 @@ static const struct nvme_core_quirk_entry core_quirks[] = { * * The device is left in a state where it is also not possible * to use "nvme set-feature" to disable APST, but booting with - * nvme_core.default_ps_max_latency=0 works. + * nvme_core.default_ps_max_latency_us=0 works. */ .vid = 0x1e0f, .mn = "KCD6XVUL6T40", From ba9d308ccd6732dd97ed8080d834a4a89e758e14 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 8 Apr 2026 17:18:14 +0300 Subject: [PATCH 12/22] nvme-apple: drop invalid put of admin queue reference count Commit 03b3bcd319b3 ("nvme: fix admin request_queue lifetime") moved the admin queue reference ->put call into nvme_free_ctrl() - a controller device release callback performed for every nvme driver doing nvme_init_ctrl(). nvme-apple sets refcount of the admin queue to 1 at allocation during the probe function and then puts it twice now: nvme_free_ctrl() blk_put_queue(ctrl->admin_q) // #1 ->free_ctrl() apple_nvme_free_ctrl() blk_put_queue(anv->ctrl.admin_q) // #2 Note that there is a commit 941f7298c70c ("nvme-apple: remove an extra queue reference") which intended to drop taking an extra admin queue reference. Looks like at that moment it accidentally fixed a refcount leak, which existed since the driver's introduction. There were two ->get calls at driver's probe function and a single ->put inside apple_nvme_free_ctrl(). However now after commit 03b3bcd319b3 ("nvme: fix admin request_queue lifetime") the refcount is imbalanced again. Fix it by removing extra ->put call from apple_nvme_free_ctrl(). anv->dev and ctrl->dev point to the same device, so use ctrl->dev directly for simplification. Compile tested only. Found by Linux Verification Center (linuxtesting.org). 
Fixes: 03b3bcd319b3 ("nvme: fix admin request_queue lifetime") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Fedor Pchelkin Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index ed61b97fde59..423c9c628e7b 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1267,11 +1267,7 @@ static int apple_nvme_get_address(struct nvme_ctrl *ctrl, char *buf, int size) static void apple_nvme_free_ctrl(struct nvme_ctrl *ctrl) { - struct apple_nvme *anv = ctrl_to_apple_nvme(ctrl); - - if (anv->ctrl.admin_q) - blk_put_queue(anv->ctrl.admin_q); - put_device(anv->dev); + put_device(ctrl->dev); } static const struct nvme_ctrl_ops nvme_ctrl_ops = { From 20925812de7bf5e6fdc133c691ef52b33f700fbc Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 8 Apr 2026 18:19:56 +0200 Subject: [PATCH 13/22] nvme: expose TLS mode It is not possible to determine the active TLS mode from the presence or absence of sysfs attributes like tls_key, tls_configured_key, or dhchap_secret. With the introduction of the concat mode and optional DH-CHAP authentication, different configurations can result in identical sysfs state. This makes user space detection unreliable. Expose the TLS mode explicitly to allow user space to unambiguously identify the active configuration and avoid fragile heuristics in nvme-cli. Reviewed-by: Chris Leech Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 7bf2e972126b..e59758616f27 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -883,10 +883,26 @@ static ssize_t tls_keyring_show(struct device *dev, } static DEVICE_ATTR_RO(tls_keyring); +static ssize_t tls_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + const char *mode; + + if (ctrl->opts->tls) + mode = "tls"; + else + mode = "concat"; + + return sysfs_emit(buf, "%s\n", mode); +} +static DEVICE_ATTR_RO(tls_mode); + static struct attribute *nvme_tls_attrs[] = { &dev_attr_tls_key.attr, &dev_attr_tls_configured_key.attr, &dev_attr_tls_keyring.attr, + &dev_attr_tls_mode.attr, NULL, }; @@ -908,6 +924,9 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, if (a == &dev_attr_tls_keyring.attr && !ctrl->opts->keyring) return 0; + if (a == &dev_attr_tls_mode.attr && + !ctrl->opts->tls && !ctrl->opts->concat) + return 0; return a->mode; } From 3f150f0f010f234f34a67897344f18e68fe803f7 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 15 Apr 2026 15:53:58 +0000 Subject: [PATCH 14/22] nvme-multipath: put module reference when delayed removal work is canceled The delayed disk removal work is canceled when a NS (re)appears. However, we do not put the module reference grabbed in nvme_mpath_remove_disk(), so fix that. 
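The fix relies on cancel_delayed_work() returning true only when it removed a still-pending work item, so the module reference is dropped exactly once, either at the cancel site or by the removal work when it runs. A minimal sketch of the changed call site (assuming, as the description above implies, that the work handler normally drops the reference it was queued with):

	if (cancel_delayed_work(&head->remove_work))
		/* handler will never run; drop its module reference here */
		module_put(THIS_MODULE);

If the work has already started executing, cancel_delayed_work() returns false and the handler remains responsible for the module_put().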
Reviewed-by: Christoph Hellwig Reviewed-by: Nilay Shroff Reviewed-by: Chaitanya Kulkarni Signed-off-by: John Garry Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0d623476e36f..fead3b7cd4be 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4083,7 +4083,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) mutex_unlock(&ctrl->subsys->lock); #ifdef CONFIG_NVME_MULTIPATH - cancel_delayed_work(&head->remove_work); + if (cancel_delayed_work(&head->remove_work)) + module_put(THIS_MODULE); #endif return 0; From cf92d78a4aa2adbc2b1e687776aabe63c5b97f3f Mon Sep 17 00:00:00 2001 From: Tao Jiang Date: Thu, 16 Apr 2026 01:27:15 +0800 Subject: [PATCH 15/22] nvme-pci: add quirk for Memblaze Pblaze5 (0x1c5f:0x0555) The Memblaze Pblaze5 NVMe device (PCI ID 0x1c5f:0x0555) is detected as a controller on recent kernels (tested on 5.15.85 and 6.8.4), but no namespace is exposed. Tools like lsblk and fdisk do not report any block device. dmesg shows: nvme nvme0: missing or invalid SUBNQN field. The device works correctly on older kernels (e.g. 4.19), suggesting a compatibility issue with newer namespace handling. This indicates the device does not properly support the Namespace Descriptor List feature. Applying NVME_QUIRK_NO_NS_DESC_LIST allows the namespace to be discovered correctly. Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Tao Jiang Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c693308c6dd6..bcf68c198f74 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4102,6 +4102,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0555), /* Memblaze Pblaze5 adapter */ + .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, { PCI_DEVICE(0x144d, 0xa808), /* Samsung PM981/983 */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ From bddb911d28d4412a9462e73766a706ff0d74fa77 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 20 Apr 2026 09:02:28 -0700 Subject: [PATCH 16/22] nvme: skip trace completion for host path errors The command was never dispatched for the driver's "host path error", so the command was never actually initialized and there's no corresponding submit trace for the completion. 
Reported-by: Minsik Jeon Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fead3b7cd4be..d62adebfdc68 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -454,11 +454,10 @@ void nvme_end_req(struct request *req) blk_mq_end_request(req, status); } -void nvme_complete_rq(struct request *req) +static void __nvme_complete_rq(struct request *req) { struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; - trace_nvme_complete_rq(req); nvme_cleanup_cmd(req); /* @@ -493,6 +492,12 @@ void nvme_complete_rq(struct request *req) return; } } + +void nvme_complete_rq(struct request *req) +{ + trace_nvme_complete_rq(req); + __nvme_complete_rq(req); +} EXPORT_SYMBOL_GPL(nvme_complete_rq); void nvme_complete_batch_req(struct request *req) @@ -513,7 +518,7 @@ blk_status_t nvme_host_path_error(struct request *req) { nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; blk_mq_set_request_complete(req); - nvme_complete_rq(req); + __nvme_complete_rq(req); return BLK_STS_OK; } EXPORT_SYMBOL_GPL(nvme_host_path_error); From f920ebd03cd13eb0976d18de77adf325b5461361 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Fri, 17 Apr 2026 10:48:08 +1000 Subject: [PATCH 17/22] Revert "nvmet-tcp: Don't free SQ on authentication success" In an attempt to fix REPLACETLSPSK we stopped freeing the secrets on successful connections. This resulted in memory leaks in the kernel, so let's revert the commit. An improved fix is being developed to just avoid clearing the tls_key variable. This reverts commit 2e6eb6b277f593b98f151ea8eff1beb558bbea3b. Closes: https://lore.kernel.org/linux-nvme/CAHj4cs-u3MWQR4idywptMfjEYi4YwObWFx4KVib35dZ5HMBDdw@mail.gmail.com Reviewed-by: Chris Leech Reviewed-by: Hannes Reinecke Signed-off-by: Alistair Francis Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd-auth.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index b9ab80c7a694..f1e613e7c63e 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -395,10 +395,9 @@ void nvmet_execute_auth_send(struct nvmet_req *req) goto complete; } /* Final states, clear up variables */ - if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) { - nvmet_auth_sq_free(req->sq); + nvmet_auth_sq_free(req->sq); + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) nvmet_ctrl_fatal_error(ctrl); - } complete: nvmet_req_complete(req, status); @@ -574,7 +573,9 @@ void nvmet_execute_auth_receive(struct nvmet_req *req) status = nvmet_copy_to_sgl(req, 0, d, al); kfree(d); done: - if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2) + nvmet_auth_sq_free(req->sq); + else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { nvmet_auth_sq_free(req->sq); nvmet_ctrl_fatal_error(ctrl); } From 5fc422951c962cc01e654950fc043ebd8fadd865 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Fri, 17 Apr 2026 10:48:09 +1000 Subject: [PATCH 18/22] nvmet-tcp: Don't clear tls_key when freeing sq Currently after the host sends a REPLACETLSPSK we free the TLS keys as part of calling nvmet_auth_sq_free() on success. This means when the host sends a follow-up REPLACETLSPSK we return CONCAT_MISMATCH as the check for !nvmet_queue_tls_keyid(req->sq) fails.
A previous attempt to fix this involved not calling nvmet_auth_sq_free() on successful connections, but that results in memory leaks. Instead we should not clear `tls_key` in nvmet_auth_sq_free(), as that was incorrectly wiping the tls keys which are used for the session. This patch ensures we correctly free the ephemeral session key on connection, yet we don't free the TLS key unless closing the connection. Reviewed-by: Chris Leech Reviewed-by: Hannes Reinecke Signed-off-by: Alistair Francis Signed-off-by: Keith Busch --- drivers/nvme/target/auth.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index b34610e2f19d..723b6d5604c1 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -229,9 +229,6 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) void nvmet_auth_sq_free(struct nvmet_sq *sq) { cancel_delayed_work(&sq->auth_expired_work); -#ifdef CONFIG_NVME_TARGET_TCP_TLS - sq->tls_key = NULL; -#endif kfree(sq->dhchap_c1); sq->dhchap_c1 = NULL; kfree(sq->dhchap_c2); From 26bb12b9caafa2e62d638104bf2732f610cdbb0b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 13 Apr 2026 10:16:28 -0700 Subject: [PATCH 19/22] nvme-tcp: teardown circular locking fixes When a controller reset is triggered via sysfs (by writing to /sys/class/nvme//reset_controller), the reset work tears down and re-establishes all queues. The socket release using fput() defers the actual cleanup to task_work delayed_fput workqueue. This deferred cleanup can race with the subsequent queue re-allocation during reset, potentially leading to use-after-free or resource conflicts. Replace fput() with __fput_sync() to ensure synchronous socket release, guaranteeing that all socket resources are fully cleaned up before the function returns. This prevents races during controller reset where new queue setup may begin before the old socket is fully released. * Call chain during reset: nvme_reset_ctrl_work() -> nvme_tcp_teardown_ctrl() -> nvme_tcp_teardown_io_queues() -> nvme_tcp_free_io_queues() -> nvme_tcp_free_queue() <-- fput() -> __fput_sync() -> nvme_tcp_teardown_admin_queue() -> nvme_tcp_free_admin_queue() -> nvme_tcp_free_queue() <-- fput() -> __fput_sync() -> nvme_tcp_setup_ctrl() <-- race with deferred fput memalloc_noreclaim_save() sets PF_MEMALLOC which is intended for tasks performing memory reclaim work that need reserve access. While PF_MEMALLOC prevents the task from entering direct reclaim (causing __need_reclaim() to return false), it does not strip __GFP_IO from gfp flags. The allocator can therefore still trigger writeback I/O when __GFP_IO remains set, which is unsafe when the caller holds block layer locks. Switch to memalloc_noio_save() which sets PF_MEMALLOC_NOIO. This causes current_gfp_context() to strip __GFP_IO|__GFP_FS from every allocation in the scope, making it safe to allocate memory while holding elevator_lock and set->srcu. * The issue can be reproduced using blktests: nvme_trtype=tcp ./check nvme/005 blktests (master) # nvme_trtype=tcp ./check nvme/005 nvme/005 (tr=tcp) (reset local loopback target) [failed] runtime 0.725s ... 0.798s something found in dmesg: [ 108.473940] run blktests nvme/005 at 2025-11-22 16:12:20 [...] ...
(See '/root/blktests/results/nodev_tr_tcp/nvme/005.dmesg' for the entire message) blktests (master) # cat /root/blktests/results/nodev_tr_tcp/nvme/005.dmesg [ 108.473940] run blktests nvme/005 at 2025-11-22 16:12:20 [ 108.526983] loop0: detected capacity change from 0 to 2097152 [ 108.555606] nvmet: adding nsid 1 to subsystem blktests-subsystem-1 [ 108.572531] nvmet_tcp: enabling port 0 (127.0.0.1:4420) [ 108.613061] nvmet: Created nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349. [ 108.616832] nvme nvme0: creating 48 I/O queues. [ 108.630791] nvme nvme0: mapped 48/0/0 default/read/poll queues. [ 108.661892] nvme nvme0: new ctrl: NQN "blktests-subsystem-1", addr 127.0.0.1:4420, hostnqn: nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349 [ 108.746639] nvmet: Created nvm controller 2 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349. [ 108.748466] nvme nvme0: creating 48 I/O queues. [ 108.802984] nvme nvme0: mapped 48/0/0 default/read/poll queues. [ 108.829983] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1" [ 108.854288] block nvme0n1: no available path - failing I/O [ 108.854344] block nvme0n1: no available path - failing I/O [ 108.854373] Buffer I/O error on dev nvme0n1, logical block 1, async page read [ 108.891693] ====================================================== [ 108.895912] WARNING: possible circular locking dependency detected [ 108.900184] 6.17.0nvme+ #3 Tainted: G N [ 108.903913] ------------------------------------------------------ [ 108.908171] nvme/2734 is trying to acquire lock: [ 108.911957] ffff88810210e610 (set->srcu){.+.+}-{0:0}, at: __synchronize_srcu+0x17/0x170 [ 108.917587] but task is already holding lock: [ 108.921570] ffff88813abea198 (&q->elevator_lock){+.+.}-{4:4}, at: elevator_change+0xa8/0x1c0 [ 108.927361] which lock already depends on the new lock. 
[ 108.933018] the existing dependency chain (in reverse order) is: [ 108.938223] -> #4 (&q->elevator_lock){+.+.}-{4:4}: [ 108.942988] __mutex_lock+0xa2/0x1150 [ 108.945873] elevator_change+0xa8/0x1c0 [ 108.948925] elv_iosched_store+0xdf/0x140 [ 108.952043] kernfs_fop_write_iter+0x16a/0x220 [ 108.955367] vfs_write+0x378/0x520 [ 108.957598] ksys_write+0x67/0xe0 [ 108.959721] do_syscall_64+0x76/0xbb0 [ 108.962052] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 108.965145] -> #3 (&q->q_usage_counter(io)){++++}-{0:0}: [ 108.968923] blk_alloc_queue+0x30e/0x350 [ 108.972117] blk_mq_alloc_queue+0x61/0xd0 [ 108.974677] scsi_alloc_sdev+0x2a0/0x3e0 [ 108.977092] scsi_probe_and_add_lun+0x1bd/0x430 [ 108.979921] __scsi_add_device+0x109/0x120 [ 108.982504] ata_scsi_scan_host+0x97/0x1c0 [ 108.984365] async_run_entry_fn+0x2d/0x130 [ 108.986109] process_one_work+0x20e/0x630 [ 108.987830] worker_thread+0x184/0x330 [ 108.989473] kthread+0x10a/0x250 [ 108.990852] ret_from_fork+0x297/0x300 [ 108.992491] ret_from_fork_asm+0x1a/0x30 [ 108.994159] -> #2 (fs_reclaim){+.+.}-{0:0}: [ 108.996320] fs_reclaim_acquire+0x99/0xd0 [ 108.998058] kmem_cache_alloc_node_noprof+0x4e/0x3c0 [ 109.000123] __alloc_skb+0x15f/0x190 [ 109.002195] tcp_send_active_reset+0x3f/0x1e0 [ 109.004038] tcp_disconnect+0x50b/0x720 [ 109.005695] __tcp_close+0x2b8/0x4b0 [ 109.007227] tcp_close+0x20/0x80 [ 109.008663] inet_release+0x31/0x60 [ 109.010175] __sock_release+0x3a/0xc0 [ 109.011778] sock_close+0x14/0x20 [ 109.013263] __fput+0xee/0x2c0 [ 109.014673] delayed_fput+0x31/0x50 [ 109.016183] process_one_work+0x20e/0x630 [ 109.017897] worker_thread+0x184/0x330 [ 109.019543] kthread+0x10a/0x250 [ 109.020929] ret_from_fork+0x297/0x300 [ 109.022565] ret_from_fork_asm+0x1a/0x30 [ 109.024194] -> #1 (sk_lock-AF_INET-NVME){+.+.}-{0:0}: [ 109.026634] lock_sock_nested+0x2e/0x70 [ 109.028251] tcp_sendmsg+0x1a/0x40 [ 109.029783] sock_sendmsg+0xed/0x110 [ 109.031321] nvme_tcp_try_send_cmd_pdu+0x13e/0x260 [nvme_tcp] [ 109.034263] nvme_tcp_try_send+0xb3/0x330 [nvme_tcp] [ 109.036375] nvme_tcp_queue_rq+0x342/0x3d0 [nvme_tcp] [ 109.038528] blk_mq_dispatch_rq_list+0x297/0x800 [ 109.040448] __blk_mq_sched_dispatch_requests+0x3db/0x5f0 [ 109.042677] blk_mq_sched_dispatch_requests+0x29/0x70 [ 109.044787] blk_mq_run_work_fn+0x76/0x1b0 [ 109.046535] process_one_work+0x20e/0x630 [ 109.048245] worker_thread+0x184/0x330 [ 109.049890] kthread+0x10a/0x250 [ 109.051331] ret_from_fork+0x297/0x300 [ 109.053024] ret_from_fork_asm+0x1a/0x30 [ 109.054740] -> #0 (set->srcu){.+.+}-{0:0}: [ 109.056850] __lock_acquire+0x1468/0x2210 [ 109.058614] lock_sync+0xa5/0x110 [ 109.060048] __synchronize_srcu+0x49/0x170 [ 109.061802] elevator_switch+0xc9/0x330 [ 109.063950] elevator_change+0x128/0x1c0 [ 109.065675] elevator_set_none+0x4c/0x90 [ 109.067316] blk_unregister_queue+0xa8/0x110 [ 109.069165] __del_gendisk+0x14e/0x3c0 [ 109.070824] del_gendisk+0x75/0xa0 [ 109.072328] nvme_ns_remove+0xf2/0x230 [nvme_core] [ 109.074365] nvme_remove_namespaces+0xf2/0x150 [nvme_core] [ 109.076652] nvme_do_delete_ctrl+0x71/0x90 [nvme_core] [ 109.078775] nvme_delete_ctrl_sync+0x3b/0x50 [nvme_core] [ 109.081009] nvme_sysfs_delete+0x34/0x40 [nvme_core] [ 109.083082] kernfs_fop_write_iter+0x16a/0x220 [ 109.085009] vfs_write+0x378/0x520 [ 109.086539] ksys_write+0x67/0xe0 [ 109.087982] do_syscall_64+0x76/0xbb0 [ 109.089577] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 109.091665] other info that might help us debug this: [ 109.095478] Chain exists of: set->srcu --> &q->q_usage_counter(io) --> &q->elevator_lock [ 
109.099544] Possible unsafe locking scenario: [ 109.101708] CPU0 CPU1 [ 109.103402] ---- ---- [ 109.105103] lock(&q->elevator_lock); [ 109.106530] lock(&q->q_usage_counter(io)); [ 109.109022] lock(&q->elevator_lock); [ 109.111391] sync(set->srcu); [ 109.112586] *** DEADLOCK *** [ 109.114772] 5 locks held by nvme/2734: [ 109.116189] #0: ffff888101925410 (sb_writers#4){.+.+}-{0:0}, at: ksys_write+0x67/0xe0 [ 109.119143] #1: ffff88817a914e88 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0x10f/0x220 [ 109.123141] #2: ffff8881046313f8 (kn->active#185){++++}-{0:0}, at: sysfs_remove_file_self+0x26/0x50 [ 109.126543] #3: ffff88810470e1d0 (&set->update_nr_hwq_lock){++++}-{4:4}, at: del_gendisk+0x6d/0xa0 [ 109.129891] #4: ffff88813abea198 (&q->elevator_lock){+.+.}-{4:4}, at: elevator_change+0xa8/0x1c0 [ 109.133149] stack backtrace: [ 109.134817] CPU: 6 UID: 0 PID: 2734 Comm: nvme Tainted: G N 6.17.0nvme+ #3 PREEMPT(voluntary) [ 109.134819] Tainted: [N]=TEST [ 109.134820] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 109.134821] Call Trace: [ 109.134823] [ 109.134824] dump_stack_lvl+0x75/0xb0 [ 109.134828] print_circular_bug+0x26a/0x330 [ 109.134831] check_noncircular+0x12f/0x150 [ 109.134834] __lock_acquire+0x1468/0x2210 [ 109.134837] ? __synchronize_srcu+0x17/0x170 [ 109.134838] lock_sync+0xa5/0x110 [ 109.134840] ? __synchronize_srcu+0x17/0x170 [ 109.134842] __synchronize_srcu+0x49/0x170 [ 109.134843] ? mark_held_locks+0x49/0x80 [ 109.134845] ? _raw_spin_unlock_irqrestore+0x2d/0x60 [ 109.134847] ? kvm_clock_get_cycles+0x14/0x30 [ 109.134853] ? ktime_get_mono_fast_ns+0x36/0xb0 [ 109.134858] elevator_switch+0xc9/0x330 [ 109.134860] elevator_change+0x128/0x1c0 [ 109.134862] ? kernfs_put.part.0+0x86/0x290 [ 109.134864] elevator_set_none+0x4c/0x90 [ 109.134866] blk_unregister_queue+0xa8/0x110 [ 109.134868] __del_gendisk+0x14e/0x3c0 [ 109.134870] del_gendisk+0x75/0xa0 [ 109.134872] nvme_ns_remove+0xf2/0x230 [nvme_core] [ 109.134879] nvme_remove_namespaces+0xf2/0x150 [nvme_core] [ 109.134887] nvme_do_delete_ctrl+0x71/0x90 [nvme_core] [ 109.134893] nvme_delete_ctrl_sync+0x3b/0x50 [nvme_core] [ 109.134899] nvme_sysfs_delete+0x34/0x40 [nvme_core] [ 109.134905] kernfs_fop_write_iter+0x16a/0x220 [ 109.134908] vfs_write+0x378/0x520 [ 109.134911] ksys_write+0x67/0xe0 [ 109.134913] do_syscall_64+0x76/0xbb0 [ 109.134915] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 109.134916] RIP: 0033:0x7fd68a737317 [ 109.134917] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 [ 109.134919] RSP: 002b:00007ffded1546d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 109.134920] RAX: ffffffffffffffda RBX: 000000000054f7e0 RCX: 00007fd68a737317 [ 109.134921] RDX: 0000000000000001 RSI: 00007fd68a855719 RDI: 0000000000000003 [ 109.134921] RBP: 0000000000000003 R08: 0000000030407850 R09: 00007fd68a7cd4e0 [ 109.134922] R10: 00007fd68a65b130 R11: 0000000000000246 R12: 00007fd68a855719 [ 109.134923] R13: 00000000304074c0 R14: 00000000304074c0 R15: 0000000030408660 [ 109.134926] [ 109.962756] Key type psk unregistered Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/tcp.c 
b/drivers/nvme/host/tcp.c index 02c95c32b07e..15d36d6a728e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1438,18 +1438,32 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - unsigned int noreclaim_flag; + unsigned int noio_flag; if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; page_frag_cache_drain(&queue->pf_cache); - noreclaim_flag = memalloc_noreclaim_save(); - /* ->sock will be released by fput() */ - fput(queue->sock->file); + /** + * Prevent memory reclaim from triggering block I/O during socket + * teardown. The socket release path fput -> tcp_close -> + * tcp_disconnect -> tcp_send_active_reset may allocate memory, and + * allowing reclaim to issue I/O could deadlock if we're being called + * from block device teardown (e.g., del_gendisk -> elevator cleanup) + * which holds locks that the I/O completion path needs. + */ + noio_flag = memalloc_noio_save(); + + /** + * Release the socket synchronously. During reset in + * nvme_reset_ctrl_work(), queue teardown is immediately followed by + * re-allocation. fput() defers socket cleanup to delayed_fput_work + * in workqueue context, which can race with new queue setup. + */ + __fput_sync(queue->sock->file); queue->sock = NULL; - memalloc_noreclaim_restore(noreclaim_flag); + memalloc_noio_restore(noio_flag); kfree(queue->pdu); mutex_destroy(&queue->send_mutex); @@ -1901,8 +1915,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, err_rcv_pdu: kfree(queue->pdu); err_sock: - /* ->sock will be released by fput() */ - fput(queue->sock->file); + /* Use sync variant - see nvme_tcp_free_queue() for explanation */ + __fput_sync(queue->sock->file); queue->sock = NULL; err_destroy_mutex: mutex_destroy(&queue->send_mutex); From 5d10069e1a1691a0d8642e1fa65f4c1869210299 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Fri, 17 Apr 2026 10:50:48 +1000 Subject: [PATCH 20/22] nvme-auth: Include SC_C in RVAL controller hash Section 8.3.4.5.5 of the NVMe Base Specification 2.1 describes what is included in the Response Value (RVAL) hash and SC_C should be included. Currently we are hardcoding 0 instead of using the correct SC_C value. Update the host and target code to use the SC_C when calculating the RVAL instead of using 0. 
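As a sketch of what changes in the hashed byte stream (only the portion visible in this patch; earlier RVAL input fields are omitted):

	old:  ... || le16(transaction) || 0x00 || "Controller" || SubsysNQN || 0x00 || HostNQN
	new:  ... || le16(transaction) || SC_C || "Controller" || SubsysNQN || 0x00 || HostNQN

That is, the single byte that previously carried a hardcoded zero now carries the negotiated SC_C value, while the byte between the two NQNs stays zero.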
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index bbedbe181c8a..63f543e80998 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -535,11 +535,12 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,

	put_unaligned_le16(chap->transaction, buf);
	nvme_auth_hmac_update(&hmac, buf, 2);
-	memset(buf, 0, 4);
+	*buf = chap->sc_c;
	nvme_auth_hmac_update(&hmac, buf, 1);
	nvme_auth_hmac_update(&hmac, "Controller", 10);
	nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn,
			      strlen(ctrl->opts->subsysnqn));
+	memset(buf, 0, 4);
	nvme_auth_hmac_update(&hmac, buf, 1);
	nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn,
			      strlen(ctrl->opts->host->nqn));
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 723b6d5604c1..c35c427ca2ac 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -399,11 +399,12 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,

	put_unaligned_le16(req->sq->dhchap_tid, buf);
	nvme_auth_hmac_update(&hmac, buf, 2);
-	memset(buf, 0, 4);
+	*buf = req->sq->sc_c;
	nvme_auth_hmac_update(&hmac, buf, 1);
	nvme_auth_hmac_update(&hmac, "Controller", 10);
	nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn,
			      strlen(ctrl->subsys->subsysnqn));
+	memset(buf, 0, 4);
	nvme_auth_hmac_update(&hmac, buf, 1);
	nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn));
	nvme_auth_hmac_final(&hmac, response);

From 1cc4cdae2a3b7730d462d69e30f213fd2efe7807 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Tue, 21 Apr 2026 09:14:02 -0700
Subject: [PATCH 21/22] nvme-pci: fix missed admin queue sq doorbell write

We can batch admin commands submitted through io_uring_cmd passthrough,
which means bd->last may be false and the submission queue doorbell
write is skipped so that multiple commands are aggregated per write. If
a subsequent command can't be dispatched for whatever reason, we have
to provide the blk-mq ops' commit_rqs callback to ensure the doorbell
is properly updated.

Fixes: 58e5bdeb9c2b ("nvme: enable uring-passthrough for admin commands")
Reviewed-by: Christoph Hellwig
Reviewed-by: Kanchan Joshi
Signed-off-by: Keith Busch
---
 drivers/nvme/host/pci.c | 1 +
 1 file changed, 1 insertion(+)
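To illustrate the blk-mq contract involved: a minimal sketch of the ->queue_rq()/->commit_rqs() interplay. Only bd->last and the ops fields are real; the demo_* functions are made up for illustration.

#include <linux/blk-mq.h>

static void demo_write_sq_doorbell(struct blk_mq_hw_ctx *hctx);	/* hypothetical helper */

static blk_status_t demo_queue_rq(struct blk_mq_hw_ctx *hctx,
				  const struct blk_mq_queue_data *bd)
{
	/* ... copy the command into the submission queue ... */
	if (bd->last)
		demo_write_sq_doorbell(hctx);	/* flush only on the last request of a batch */
	return BLK_STS_OK;
}

/*
 * Called by blk-mq when it queued one or more requests with
 * bd->last == false and then stopped dispatching, so any batched
 * submission queue entries still need their doorbell write.
 */
static void demo_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	demo_write_sq_doorbell(hctx);
}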
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index bcf68c198f74..f953237922da 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2239,6 +2239,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 static const struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
+	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_admin_init_hctx,
	.init_request	= nvme_pci_init_request,
	.timeout	= nvme_timeout,

From bd7b7ce96db4487bb77692a85ee4489fd2c395df Mon Sep 17 00:00:00 2001
From: Chris Leech
Date: Wed, 22 Apr 2026 12:06:36 -0700
Subject: [PATCH 22/22] nvme-auth: Hash DH shared secret to create session key

The NVMe Base Specification 8.3.5.5.9 states that the session key Ks
shall be computed from the ephemeral DH key by applying the hash
function selected by the HashID parameter. The current implementation
stores the raw DH shared secret as the session key without hashing it.

This causes redundant hash operations:

1. Augmented challenge computation (section 8.3.5.5.4) requires
   Ca = HMAC(H(g^xy mod p), C). The code compensates by hashing the
   unhashed session key in nvme_auth_augmented_challenge() to produce
   the correct result.

2. PSK generation (section 8.3.5.5.9) requires PSK = HMAC(Ks, C1 || C2)
   where Ks should already be H(g^xy mod p). As the DH shared secret is
   always larger than the HMAC block size, HMAC internally hashes it
   before use, accidentally producing the correct result.

When using secure channel concatenation with bidirectional
authentication, this results in hashing the DH value three times: twice
for augmented challenge calculations and once during PSK generation.

Fix this by:

- Modifying nvme_auth_gen_shared_secret() to hash the DH shared secret
  once after computation: Ks = H(g^xy mod p)
- Removing the hash operation from nvme_auth_augmented_challenge() as
  the session key is now already hashed
- Updating the session key buffer size from the DH key size to the hash
  output size
- Adding specification references in comments

This avoids storing the raw DH shared secret and reduces the number of
hash operations from three to one when using secure channel
concatenation.

Reviewed-by: Hannes Reinecke
Reviewed-by: Eric Biggers
Signed-off-by: Chris Leech
Signed-off-by: Keith Busch
---
 drivers/nvme/common/auth.c | 94 ++++++++++++++++++++++++++++++--------
 drivers/nvme/host/auth.c   | 13 +++---
 drivers/nvme/target/auth.c | 15 +++---
 include/linux/nvme-auth.h  |  6 +--
 4 files changed, 92 insertions(+), 36 deletions(-)
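For reference, the derivation chain this patch settles on, written out compactly. The helper names are the ones declared in include/linux/nvme-auth.h below; the mapping is a summary of the commit message, not literal code.

/*
 *   Ks  = H(g^xy mod p)           -> nvme_auth_gen_session_key()
 *   Ca  = HMAC(Ks, C)             -> nvme_auth_augmented_challenge()
 *   PSK = HMAC(Ks, C1 || C2)      -> nvme_auth_generate_psk()
 *
 * The DH shared secret is hashed exactly once and is never stored raw.
 */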
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index 2d325fb93083..77f1d22512f8 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -351,18 +351,29 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
 }
 EXPORT_SYMBOL_GPL(nvme_auth_transform_key);

+/**
+ * nvme_auth_augmented_challenge() - Compute the augmented DH-HMAC-CHAP challenge
+ * @hmac_id: Hash algorithm identifier
+ * @skey: Session key
+ * @skey_len: Length of @skey
+ * @challenge: Challenge value
+ * @aug: Output buffer for the augmented challenge
+ * @hlen: Hash output length (length of @challenge and @aug)
+ *
+ * NVMe base specification 8.3.5.5.4: The augmented challenge is computed
+ * applying the HMAC function using the hash function H() selected by the
+ * HashID parameter ... with the hash of the ephemeral DH key ... as HMAC key
+ * to the challenge C (i.e., Ca = HMAC(H(g^xy mod p), C)).
+ *
+ * As the session key skey is already H(g^xy mod p) per section 8.3.5.5.9, use
+ * it directly as the HMAC key without additional hashing.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
 int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len,
		const u8 *challenge, u8 *aug, size_t hlen)
 {
-	u8 hashed_key[NVME_AUTH_MAX_DIGEST_SIZE];
-	int ret;
-
-	ret = nvme_auth_hash(hmac_id, skey, skey_len, hashed_key);
-	if (ret)
-		return ret;
-	ret = nvme_auth_hmac(hmac_id, hashed_key, hlen, challenge, hlen, aug);
-	memzero_explicit(hashed_key, sizeof(hashed_key));
-	return ret;
+	return nvme_auth_hmac(hmac_id, skey, skey_len, challenge, hlen, aug);
 }
 EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge);

@@ -403,33 +414,76 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
 }
 EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey);

-int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
-		const u8 *ctrl_key, size_t ctrl_key_len,
-		u8 *sess_key, size_t sess_key_len)
+/**
+ * nvme_auth_gen_session_key() - Generate an ephemeral session key
+ * @dh_tfm: Diffie-Hellman transform with local private key already set
+ * @public_key: Peer's public key
+ * @public_key_len: Length of @public_key
+ * @sess_key: Output buffer for the session key
+ * @sess_key_len: Size of @sess_key buffer
+ * @hash_id: Hash algorithm identifier
+ *
+ * NVMe base specification 8.3.5.5.9: The session key Ks shall be computed from
+ * the ephemeral DH key (i.e., g^xy mod p) ... by applying the hash function
+ * H() selected by the HashID parameter ... (i.e., Ks = H(g^xy mod p)).
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int nvme_auth_gen_session_key(struct crypto_kpp *dh_tfm,
+		const u8 *public_key, size_t public_key_len,
+		u8 *sess_key, size_t sess_key_len, u8 hash_id)
 {
	struct kpp_request *req;
	struct crypto_wait wait;
	struct scatterlist src, dst;
+	u8 *dh_secret;
+	size_t dh_secret_len, hash_len;
	int ret;

-	req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
-	if (!req)
+	hash_len = nvme_auth_hmac_hash_len(hash_id);
+	if (!hash_len) {
+		pr_warn("%s: invalid hash algorithm %d\n", __func__, hash_id);
+		return -EINVAL;
+	}
+
+	if (sess_key_len != hash_len) {
+		pr_warn("%s: sess_key buffer missized (%zu != %zu)\n",
+			__func__, sess_key_len, hash_len);
+		return -EINVAL;
+	}
+
+	dh_secret_len = crypto_kpp_maxsize(dh_tfm);
+	dh_secret = kzalloc(dh_secret_len, GFP_KERNEL);
+	if (!dh_secret)
		return -ENOMEM;

+	req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+	if (!req) {
+		ret = -ENOMEM;
+		goto out_free_secret;
+	}
+
	crypto_init_wait(&wait);
-	sg_init_one(&src, ctrl_key, ctrl_key_len);
-	kpp_request_set_input(req, &src, ctrl_key_len);
-	sg_init_one(&dst, sess_key, sess_key_len);
-	kpp_request_set_output(req, &dst, sess_key_len);
+	sg_init_one(&src, public_key, public_key_len);
+	kpp_request_set_input(req, &src, public_key_len);
+	sg_init_one(&dst, dh_secret, dh_secret_len);
+	kpp_request_set_output(req, &dst, dh_secret_len);
	kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				 crypto_req_done, &wait);

	ret = crypto_wait_req(crypto_kpp_compute_shared_secret(req), &wait);
-
	kpp_request_free(req);
+
+	if (ret)
+		goto out_free_secret;
+
+	ret = nvme_auth_hash(hash_id, dh_secret, dh_secret_len, sess_key);
+
+out_free_secret:
+	kfree_sensitive(dh_secret);
	return ret;
 }
-EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret);
+EXPORT_SYMBOL_GPL(nvme_auth_gen_session_key);

 int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key)
 {
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 63f543e80998..16de4499a8e7 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -588,7 +588,7 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
	}

 gen_sesskey:
-	chap->sess_key_len = chap->host_key_len;
+	chap->sess_key_len = chap->hash_len;
	chap->sess_key = kmalloc(chap->sess_key_len, GFP_KERNEL);
	if (!chap->sess_key) {
		chap->sess_key_len = 0;
@@ -596,16 +596,17 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
		return -ENOMEM;
	}

-	ret = nvme_auth_gen_shared_secret(chap->dh_tfm,
-			chap->ctrl_key, chap->ctrl_key_len,
-			chap->sess_key, chap->sess_key_len);
+	ret = nvme_auth_gen_session_key(chap->dh_tfm,
+			chap->ctrl_key, chap->ctrl_key_len,
+			chap->sess_key, chap->sess_key_len,
+			chap->hash_id);
	if (ret) {
		dev_dbg(ctrl->device,
-			"failed to generate shared secret, error %d\n", ret);
+			"failed to generate session key, error %d\n", ret);
		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
		return ret;
	}
-	dev_dbg(ctrl->device, "shared secret %*ph\n",
+	dev_dbg(ctrl->device, "session key %*ph\n",
		(int)chap->sess_key_len, chap->sess_key);
	return 0;
 }
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index c35c427ca2ac..9a2eccdc8b13 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -447,18 +447,19 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
	int ret;

-	req->sq->dhchap_skey_len = ctrl->dh_keysize;
+	req->sq->dhchap_skey_len = nvme_auth_hmac_hash_len(ctrl->shash_id);
	req->sq->dhchap_skey = kzalloc(req->sq->dhchap_skey_len, GFP_KERNEL);
	if (!req->sq->dhchap_skey)
		return -ENOMEM;
-	ret = nvme_auth_gen_shared_secret(ctrl->dh_tfm,
-			pkey, pkey_size,
-			req->sq->dhchap_skey,
-			req->sq->dhchap_skey_len);
+	ret = nvme_auth_gen_session_key(ctrl->dh_tfm,
+			pkey, pkey_size,
+			req->sq->dhchap_skey,
+			req->sq->dhchap_skey_len,
+			ctrl->shash_id);
	if (ret)
-		pr_debug("failed to compute shared secret, err %d\n", ret);
+		pr_debug("failed to compute session key, err %d\n", ret);
	else
-		pr_debug("%s: shared secret %*ph\n", __func__,
+		pr_debug("%s: session key %*ph\n", __func__,
			(int)req->sq->dhchap_skey_len,
			req->sq->dhchap_skey);
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
index 184a1f9510fa..89902ae8b929 100644
--- a/include/linux/nvme-auth.h
+++ b/include/linux/nvme-auth.h
@@ -49,9 +49,9 @@ int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len,
 int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid);
 int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
		u8 *host_key, size_t host_key_len);
-int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
-		const u8 *ctrl_key, size_t ctrl_key_len,
-		u8 *sess_key, size_t sess_key_len);
+int nvme_auth_gen_session_key(struct crypto_kpp *dh_tfm,
+		const u8 *public_key, size_t public_key_len,
+		u8 *sess_key, size_t sess_key_len, u8 hash_id);
 int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len,
		const u8 *c1, const u8 *c2, size_t hash_len,
		u8 **ret_psk, size_t *ret_len);
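Usage-wise, callers now size the session key buffer by the digest length rather than the DH key size. A condensed sketch of the call pattern, with illustrative variable names; the helpers are the ones used in the hunks above:

	/* Illustrative caller pattern after this patch: size by hash length. */
	skey_len = nvme_auth_hmac_hash_len(hash_id);
	skey = kzalloc(skey_len, GFP_KERNEL);
	if (!skey)
		return -ENOMEM;
	ret = nvme_auth_gen_session_key(dh_tfm, peer_pubkey, peer_pubkey_len,
					skey, skey_len, hash_id);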