Files
linux/drivers/uio/uio_hv_generic.c
Naman Jain b15b7d2a1b uio_hv_generic: Let userspace take care of interrupt mask
Remove the logic to set interrupt mask by default in uio_hv_generic
driver as the interrupt mask value is supposed to be controlled
completely by the user space. If the mask bit gets changed
by the driver, concurrently with user mode operating on the ring,
the mask bit may be set when it is supposed to be clear, and the
user-mode driver will miss an interrupt which will cause a hang.

For eg- when the driver sets inbound ring buffer interrupt mask to 1,
the host does not interrupt the guest on the UIO VMBus channel.
However, setting the mask does not prevent the host from putting a
message in the inbound ring buffer. So let’s assume that happens,
the host puts a message into the ring buffer but does not interrupt.

Subsequently, the user space code in the guest sets the inbound ring
buffer interrupt mask to 0, saying “Hey, I’m ready for interrupts”.
User space code then calls pread() to wait for an interrupt.
Then one of two things happens:

* The host never sends another message. So the pread() waits forever.
* The host does send another message. But because there’s already a
  message in the ring buffer, it doesn’t generate an interrupt.
  This is the correct behavior, because the host should only send an
  interrupt when the inbound ring buffer transitions from empty to
  not-empty. Adding an additional message to a ring buffer that is not
  empty is not supposed to generate an interrupt on the guest.
  Since the guest is waiting in pread() and not removing messages from
  the ring buffer, the pread() waits forever.

This could be easily reproduced in hv_fcopy_uio_daemon if we delay
setting interrupt mask to 0.

Similarly if hv_uio_channel_cb() sets the interrupt_mask to 1,
there’s a race condition. Once user space empties the inbound ring
buffer, but before user space sets interrupt_mask to 0, the host could
put another message in the ring buffer but it wouldn’t interrupt.
Then the next pread() would hang.

Fix these by removing all instances where interrupt_mask is changed,
while keeping the one in set_event() unchanged to enable userspace
control the interrupt mask by writing 0/1 to /dev/uioX.

Fixes: 95096f2fbd ("uio-hv-generic: new userspace i/o driver for VMBus")
Suggested-by: John Starks <jostarks@microsoft.com>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
Cc: stable@vger.kernel.org
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Long Li <longli@microsoft.com>
Reviewed-by: Tianyu Lan <tiala@microsoft.com>
Tested-by: Tianyu Lan <tiala@microsoft.com>
Link: https://lore.kernel.org/r/20250828044200.492030-1-namjain@linux.microsoft.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2025-09-06 15:57:21 +02:00

424 lines
11 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* uio_hv_generic - generic UIO driver for VMBus
*
* Copyright (c) 2013-2016 Brocade Communications Systems, Inc.
* Copyright (c) 2016, Microsoft Corporation.
*
* Since the driver does not declare any device ids, you must allocate
* id and bind the device to the driver yourself. For example:
*
* Associate Network GUID with UIO device
* # echo "f8615163-df3e-46c5-913f-f2d2f965ed0e" \
* > /sys/bus/vmbus/drivers/uio_hv_generic/new_id
* Then rebind
* # echo -n "ed963694-e847-4b2a-85af-bc9cfc11d6f3" \
* > /sys/bus/vmbus/drivers/hv_netvsc/unbind
* # echo -n "ed963694-e847-4b2a-85af-bc9cfc11d6f3" \
* > /sys/bus/vmbus/drivers/uio_hv_generic/bind
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/uio_driver.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <linux/hyperv.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include "../hv/hyperv_vmbus.h"
#define DRIVER_VERSION "0.02.1"
#define DRIVER_AUTHOR "Stephen Hemminger <sthemmin at microsoft.com>"
#define DRIVER_DESC "Generic UIO driver for VMBus devices"
#define SEND_BUFFER_SIZE (16 * 1024 * 1024)
#define RECV_BUFFER_SIZE (31 * 1024 * 1024)
/*
* List of resources to be mapped to user space
* can be extended up to MAX_UIO_MAPS(5) items
*/
enum hv_uio_map {
TXRX_RING_MAP = 0,
INT_PAGE_MAP,
MON_PAGE_MAP,
RECV_BUF_MAP,
SEND_BUF_MAP
};
struct hv_uio_private_data {
struct uio_info info;
struct hv_device *device;
atomic_t refcnt;
void *recv_buf;
struct vmbus_gpadl recv_gpadl;
char recv_name[32]; /* "recv_4294967295" */
void *send_buf;
struct vmbus_gpadl send_gpadl;
char send_name[32];
};
static void set_event(struct vmbus_channel *channel, s32 irq_state)
{
channel->inbound.ring_buffer->interrupt_mask = !irq_state;
if (!channel->offermsg.monitor_allocated && irq_state) {
/* MB is needed for host to see the interrupt mask first */
virt_mb();
vmbus_set_event(channel);
}
}
/*
* This is the irqcontrol callback to be registered to uio_info.
* It can be used to disable/enable interrupt from user space processes.
*
* @param info
* pointer to uio_info.
* @param irq_state
* state value. 1 to enable interrupt, 0 to disable interrupt.
*/
static int
hv_uio_irqcontrol(struct uio_info *info, s32 irq_state)
{
struct hv_uio_private_data *pdata = info->priv;
struct hv_device *dev = pdata->device;
struct vmbus_channel *primary, *sc;
primary = dev->channel;
set_event(primary, irq_state);
mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(sc, &primary->sc_list, sc_list)
set_event(sc, irq_state);
mutex_unlock(&vmbus_connection.channel_mutex);
return 0;
}
/*
* Callback from vmbus_event when something is in inbound ring.
*/
static void hv_uio_channel_cb(void *context)
{
struct vmbus_channel *chan = context;
struct hv_device *hv_dev;
struct hv_uio_private_data *pdata;
virt_mb();
/*
* The callback may come from a subchannel, in which case look
* for the hv device in the primary channel
*/
hv_dev = chan->primary_channel ?
chan->primary_channel->device_obj : chan->device_obj;
pdata = hv_get_drvdata(hv_dev);
uio_event_notify(&pdata->info);
}
/*
* Callback from vmbus_event when channel is rescinded.
* It is meant for rescind of primary channels only.
*/
static void hv_uio_rescind(struct vmbus_channel *channel)
{
struct hv_device *hv_dev = channel->device_obj;
struct hv_uio_private_data *pdata = hv_get_drvdata(hv_dev);
/*
* Turn off the interrupt file handle
* Next read for event will return -EIO
*/
pdata->info.irq = 0;
/* Wake up reader */
uio_event_notify(&pdata->info);
/*
* With rescind callback registered, rescind path will not unregister the device
* from vmbus when the primary channel is rescinded.
* Without it, rescind handling is incomplete and next onoffer msg does not come.
* Unregister the device from vmbus here.
*/
vmbus_device_unregister(channel->device_obj);
}
/* Function used for mmap of ring buffer sysfs interface.
* The ring buffer is allocated as contiguous memory by vmbus_open
*/
static int
hv_uio_ring_mmap(struct vmbus_channel *channel, struct vm_area_struct *vma)
{
void *ring_buffer = page_address(channel->ringbuffer_page);
if (channel->state != CHANNEL_OPENED_STATE)
return -ENODEV;
return vm_iomap_memory(vma, virt_to_phys(ring_buffer),
channel->ringbuffer_pagecount << PAGE_SHIFT);
}
/* Callback from VMBUS subsystem when new channel created. */
static void
hv_uio_new_channel(struct vmbus_channel *new_sc)
{
struct hv_device *hv_dev = new_sc->primary_channel->device_obj;
struct device *device = &hv_dev->device;
const size_t ring_bytes = SZ_2M;
int ret;
/* Create host communication ring */
ret = vmbus_open(new_sc, ring_bytes, ring_bytes, NULL, 0,
hv_uio_channel_cb, new_sc);
if (ret) {
dev_err(device, "vmbus_open subchannel failed: %d\n", ret);
return;
}
set_channel_read_mode(new_sc, HV_CALL_ISR);
ret = hv_create_ring_sysfs(new_sc, hv_uio_ring_mmap);
if (ret) {
dev_err(device, "sysfs create ring bin file failed; %d\n", ret);
vmbus_close(new_sc);
}
}
/* free the reserved buffers for send and receive */
static void
hv_uio_cleanup(struct hv_device *dev, struct hv_uio_private_data *pdata)
{
if (pdata->send_gpadl.gpadl_handle) {
vmbus_teardown_gpadl(dev->channel, &pdata->send_gpadl);
if (!pdata->send_gpadl.decrypted)
vfree(pdata->send_buf);
}
if (pdata->recv_gpadl.gpadl_handle) {
vmbus_teardown_gpadl(dev->channel, &pdata->recv_gpadl);
if (!pdata->recv_gpadl.decrypted)
vfree(pdata->recv_buf);
}
}
/* VMBus primary channel is opened on first use */
static int
hv_uio_open(struct uio_info *info, struct inode *inode)
{
struct hv_uio_private_data *pdata
= container_of(info, struct hv_uio_private_data, info);
struct hv_device *dev = pdata->device;
int ret;
if (atomic_inc_return(&pdata->refcnt) != 1)
return 0;
vmbus_set_chn_rescind_callback(dev->channel, hv_uio_rescind);
vmbus_set_sc_create_callback(dev->channel, hv_uio_new_channel);
ret = vmbus_connect_ring(dev->channel,
hv_uio_channel_cb, dev->channel);
if (ret)
atomic_dec(&pdata->refcnt);
return ret;
}
/* VMBus primary channel is closed on last close */
static int
hv_uio_release(struct uio_info *info, struct inode *inode)
{
struct hv_uio_private_data *pdata
= container_of(info, struct hv_uio_private_data, info);
struct hv_device *dev = pdata->device;
int ret = 0;
if (atomic_dec_and_test(&pdata->refcnt))
ret = vmbus_disconnect_ring(dev->channel);
return ret;
}
static int
hv_uio_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
struct vmbus_channel *channel = dev->channel;
struct hv_uio_private_data *pdata;
void *ring_buffer;
int ret;
size_t ring_size = hv_dev_ring_size(channel);
if (!ring_size)
ring_size = SZ_2M;
/* Adjust ring size if necessary to have it page aligned */
ring_size = VMBUS_RING_SIZE(ring_size);
pdata = devm_kzalloc(&dev->device, sizeof(*pdata), GFP_KERNEL);
if (!pdata)
return -ENOMEM;
ret = vmbus_alloc_ring(channel, ring_size, ring_size);
if (ret)
return ret;
set_channel_read_mode(channel, HV_CALL_ISR);
/* Fill general uio info */
pdata->info.name = "uio_hv_generic";
pdata->info.version = DRIVER_VERSION;
pdata->info.irqcontrol = hv_uio_irqcontrol;
pdata->info.open = hv_uio_open;
pdata->info.release = hv_uio_release;
pdata->info.irq = UIO_IRQ_CUSTOM;
atomic_set(&pdata->refcnt, 0);
/* mem resources */
pdata->info.mem[TXRX_RING_MAP].name = "txrx_rings";
ring_buffer = page_address(channel->ringbuffer_page);
pdata->info.mem[TXRX_RING_MAP].addr
= (uintptr_t)virt_to_phys(ring_buffer);
pdata->info.mem[TXRX_RING_MAP].size
= channel->ringbuffer_pagecount << PAGE_SHIFT;
pdata->info.mem[TXRX_RING_MAP].memtype = UIO_MEM_IOVA;
pdata->info.mem[INT_PAGE_MAP].name = "int_page";
pdata->info.mem[INT_PAGE_MAP].addr
= (uintptr_t)vmbus_connection.int_page;
pdata->info.mem[INT_PAGE_MAP].size = HV_HYP_PAGE_SIZE;
pdata->info.mem[INT_PAGE_MAP].memtype = UIO_MEM_LOGICAL;
pdata->info.mem[MON_PAGE_MAP].name = "monitor_page";
pdata->info.mem[MON_PAGE_MAP].addr
= (uintptr_t)vmbus_connection.monitor_pages[1];
pdata->info.mem[MON_PAGE_MAP].size = HV_HYP_PAGE_SIZE;
pdata->info.mem[MON_PAGE_MAP].memtype = UIO_MEM_LOGICAL;
if (channel->device_id == HV_NIC) {
pdata->recv_buf = vzalloc(RECV_BUFFER_SIZE);
if (!pdata->recv_buf) {
ret = -ENOMEM;
goto fail_free_ring;
}
ret = vmbus_establish_gpadl(channel, pdata->recv_buf,
RECV_BUFFER_SIZE, &pdata->recv_gpadl);
if (ret) {
if (!pdata->recv_gpadl.decrypted)
vfree(pdata->recv_buf);
goto fail_close;
}
/* put Global Physical Address Label in name */
snprintf(pdata->recv_name, sizeof(pdata->recv_name),
"recv:%u", pdata->recv_gpadl.gpadl_handle);
pdata->info.mem[RECV_BUF_MAP].name = pdata->recv_name;
pdata->info.mem[RECV_BUF_MAP].addr = (uintptr_t)pdata->recv_buf;
pdata->info.mem[RECV_BUF_MAP].size = RECV_BUFFER_SIZE;
pdata->info.mem[RECV_BUF_MAP].memtype = UIO_MEM_VIRTUAL;
pdata->send_buf = vzalloc(SEND_BUFFER_SIZE);
if (!pdata->send_buf) {
ret = -ENOMEM;
goto fail_close;
}
ret = vmbus_establish_gpadl(channel, pdata->send_buf,
SEND_BUFFER_SIZE, &pdata->send_gpadl);
if (ret) {
if (!pdata->send_gpadl.decrypted)
vfree(pdata->send_buf);
goto fail_close;
}
snprintf(pdata->send_name, sizeof(pdata->send_name),
"send:%u", pdata->send_gpadl.gpadl_handle);
pdata->info.mem[SEND_BUF_MAP].name = pdata->send_name;
pdata->info.mem[SEND_BUF_MAP].addr = (uintptr_t)pdata->send_buf;
pdata->info.mem[SEND_BUF_MAP].size = SEND_BUFFER_SIZE;
pdata->info.mem[SEND_BUF_MAP].memtype = UIO_MEM_VIRTUAL;
}
pdata->info.priv = pdata;
pdata->device = dev;
ret = uio_register_device(&dev->device, &pdata->info);
if (ret) {
dev_err(&dev->device, "hv_uio register failed\n");
goto fail_close;
}
/*
* This internally calls sysfs_update_group, which returns a non-zero value if it executes
* before sysfs_create_group. This is expected as the 'ring' will be created later in
* vmbus_device_register() -> vmbus_add_channel_kobj(). Thus, no need to check the return
* value and print warning.
*
* Creating/exposing sysfs in driver probe is not encouraged as it can lead to race
* conditions with userspace. For backward compatibility, "ring" sysfs could not be removed
* or decoupled from uio_hv_generic probe. Userspace programs can make use of inotify
* APIs to make sure that ring is created.
*/
hv_create_ring_sysfs(channel, hv_uio_ring_mmap);
hv_set_drvdata(dev, pdata);
return 0;
fail_close:
hv_uio_cleanup(dev, pdata);
fail_free_ring:
vmbus_free_ring(dev->channel);
return ret;
}
static void
hv_uio_remove(struct hv_device *dev)
{
struct hv_uio_private_data *pdata = hv_get_drvdata(dev);
if (!pdata)
return;
hv_remove_ring_sysfs(dev->channel);
uio_unregister_device(&pdata->info);
hv_uio_cleanup(dev, pdata);
vmbus_free_ring(dev->channel);
}
static struct hv_driver hv_uio_drv = {
.name = "uio_hv_generic",
.id_table = NULL, /* only dynamic id's */
.probe = hv_uio_probe,
.remove = hv_uio_remove,
};
static int __init
hyperv_module_init(void)
{
return vmbus_driver_register(&hv_uio_drv);
}
static void __exit
hyperv_module_exit(void)
{
vmbus_driver_unregister(&hv_uio_drv);
}
module_init(hyperv_module_init);
module_exit(hyperv_module_exit);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);