mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-06-03 17:22:28 -04:00
__netpoll_send_skb() always transmits through np->dev and queues busy
packets on np->dev->npinfo->txq, but it leaves skb->dev unchanged.
Stacked callers such as DSA and macvlan can reach netpoll with skb->dev
still naming the upper device while np->dev is the lower device that
owns the netpoll state.
If the skb has to be deferred, queue_process() later dequeues it from
the lower device's txq but retries it through skb->dev. That can
re-enter the upper ndo_start_xmit path on an already transformed skb,
and if the upper device disappears before the lower txq drains the
workqueue can dereference a stale skb->dev pointer.
The buggy scenario involves two paths. The steps below are listed in
order within each path:

Path A (netpoll enqueue path):
1. Stacked xmit calls netpoll with lower np->dev and upper skb->dev.
2. __netpoll_send_skb() uses np->dev->npinfo as the txq owner.
3. Busy transmit queues the skb on that lower txq with upper skb->dev.
4. queue_process() drains the lower txq and reads skb->dev.

Path B (upper-device teardown):
1. Teardown unregisters the upper net_device while lower npinfo stays
   alive.
2. netdev_release() runs for the upper net_device.
3. The lower txq still owns the deferred skb.
4. queue_process() dereferences that stale upper skb->dev.
Normalize skb->dev to np->dev after loading np->dev from the netpoll
instance, before either the direct transmit path or the fallback enqueue.
This keeps the queued skb in the same device and txq domain as the
netpoll state that owns it.
KASAN report as below:
KASAN slab-use-after-free in queue_process+0x7c/0x480
Workqueue: events queue_process
The buggy address belongs to the object at ffff88810906c000 which belongs
to the cache kmalloc-4k of size 4096
The buggy address is located 168 bytes inside of freed 4096-byte region
[ffff88810906c000, ffff88810906d000)
Read of size 8
Call trace:
dump_stack_lvl+0x73/0xb0 (?:?)
print_report+0xd1/0x620 (?:?)
srso_alias_return_thunk+0x5/0xfbef5 (?:?)
__virt_addr_valid+0x215/0x420 (?:?)
kasan_complete_mode_report_info+0x64/0x200 (?:?)
kasan_report+0xf7/0x130 (?:?)
queue_process+0x7c/0x480 (net/core/netpoll.c:88)
kasan_check_range+0x10c/0x1c0 (?:?)
__kasan_check_read+0x15/0x20 (?:?)
process_one_work+0x8b7/0x1af0 (kernel/workqueue.c:3200)
assign_work+0x170/0x3f0 (?:?)
worker_thread+0x574/0xf10 (?:?)
_raw_spin_unlock_irqrestore+0x4b/0x60 (?:?)
trace_hardirqs_on+0x2a/0x180 (?:?)
kthread+0x2fc/0x3f0 (?:?)
ret_from_fork+0x58b/0x830 (?:?)
__switch_to+0x58e/0xe90 (?:?)
__switch_to_asm+0x39/0x70 (?:?)
ret_from_fork_asm+0x1a/0x30 (?:?)
Freed by task stack:
kasan_save_stack+0x3d/0x60 (?:?)
kasan_save_track+0x18/0x40 (?:?)
kasan_save_free_info+0x3f/0x60 (?:?)
__kasan_slab_free+0x48/0x70 (?:?)
kfree+0x20e/0x4e0 (?:?)
kvfree+0x31/0x40 (?:?)
netdev_release+0x71/0x90 (net/core/net-sysfs.c:2227)
device_release+0xd2/0x250 (?:?)
kobject_put+0x181/0x4c0 (lib/kobject.c:730)
netdev_run_todo+0x700/0x1000 (net/core/dev.c:11666)
rtnl_dellink+0x396/0xc00 (net/core/rtnetlink.c:3558)
rtnetlink_rcv_msg+0x740/0xc20 (net/core/rtnetlink.c:6897)
netlink_rcv_skb+0x147/0x3a0 (?:?)
rtnetlink_rcv+0x19/0x20 (net/core/rtnetlink.c:7021)
netlink_unicast+0x4d1/0x830 (net/netlink/af_netlink.c:1327)
netlink_sendmsg+0x840/0xe10 (net/netlink/af_netlink.c:1812)
____sys_sendmsg+0x8a7/0xb50 (?:?)
___sys_sendmsg+0x104/0x190 (?:?)
__sys_sendmsg+0x135/0x1d0 (?:?)
__x64_sys_sendmsg+0x7b/0xc0 (?:?)
x64_sys_call+0x205c/0x2130 (?:?)
do_syscall_64+0x115/0x6a0 (arch/x86/entry/syscall_64.c:87)
entry_SYSCALL_64_after_hwframe+0x77/0x7f (?:?)
Fixes: 5de4a473bd ("netpoll queue cleanup")
Signed-off-by: Zhang Cen <rollkingzzc@gmail.com>
Link: https://patch.msgid.link/20260519104647.3517990-1-rollkingzzc@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
883 lines
20 KiB
C
883 lines
20 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Common framework for low-level network console, dump, and debugger code
|
|
*
|
|
* Sep 8 2003 Matt Mackall <mpm@selenic.com>
|
|
*
|
|
* based on the netconsole code from:
|
|
*
|
|
* Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
|
|
* Copyright (C) 2002 Red Hat, Inc.
|
|
*/
|
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
#include <linux/moduleparam.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/etherdevice.h>
|
|
#include <linux/string.h>
|
|
#include <linux/if_arp.h>
|
|
#include <linux/inetdevice.h>
|
|
#include <linux/inet.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/netpoll.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/rcupdate.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/export.h>
|
|
#include <linux/if_vlan.h>
|
|
#include <net/tcp.h>
|
|
#include <net/udp.h>
|
|
#include <net/addrconf.h>
|
|
#include <net/ndisc.h>
|
|
#include <net/ip6_checksum.h>
|
|
#include <linux/unaligned.h>
|
|
#include <trace/events/napi.h>
|
|
#include <linux/kconfig.h>
|
|
|
|
/*
|
|
* We maintain a small pool of fully-sized skbs, to make sure the
|
|
* message gets out even in extreme OOM situations.
|
|
*/
|
|
|
|
#define MAX_UDP_CHUNK 1460
|
|
#define MAX_SKBS 32
|
|
#define USEC_PER_POLL 50
|
|
|
|
#define MAX_SKB_SIZE \
|
|
(sizeof(struct ethhdr) + \
|
|
sizeof(struct iphdr) + \
|
|
sizeof(struct udphdr) + \
|
|
MAX_UDP_CHUNK)
|
|
|
|
static void zap_completion_queue(void);
|
|
|
|
static unsigned int carrier_timeout = 4;
|
|
module_param(carrier_timeout, uint, 0644);
|
|
|
|
static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb,
|
|
struct net_device *dev,
|
|
struct netdev_queue *txq)
|
|
{
|
|
netdev_tx_t status = NETDEV_TX_OK;
|
|
netdev_features_t features;
|
|
|
|
features = netif_skb_features(skb);
|
|
|
|
if (skb_vlan_tag_present(skb) &&
|
|
!vlan_hw_offload_capable(features, skb->vlan_proto)) {
|
|
skb = __vlan_hwaccel_push_inside(skb);
|
|
if (unlikely(!skb)) {
|
|
/* This is actually a packet drop, but we
|
|
* don't want the code that calls this
|
|
* function to try and operate on a NULL skb.
|
|
*/
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
status = netdev_start_xmit(skb, dev, txq, false);
|
|
|
|
out:
|
|
return status;
|
|
}
|
|
|
|
static void queue_process(struct work_struct *work)
|
|
{
|
|
struct netpoll_info *npinfo =
|
|
container_of(work, struct netpoll_info, tx_work.work);
|
|
struct sk_buff *skb;
|
|
unsigned long flags;
|
|
|
|
while ((skb = skb_dequeue(&npinfo->txq))) {
|
|
struct net_device *dev = skb->dev;
|
|
struct netdev_queue *txq;
|
|
unsigned int q_index;
|
|
|
|
if (!netif_device_present(dev) || !netif_running(dev)) {
|
|
kfree_skb(skb);
|
|
continue;
|
|
}
|
|
|
|
local_irq_save(flags);
|
|
/* check if skb->queue_mapping is still valid */
|
|
q_index = skb_get_queue_mapping(skb);
|
|
if (unlikely(q_index >= dev->real_num_tx_queues)) {
|
|
q_index = q_index % dev->real_num_tx_queues;
|
|
skb_set_queue_mapping(skb, q_index);
|
|
}
|
|
txq = netdev_get_tx_queue(dev, q_index);
|
|
HARD_TX_LOCK(dev, txq, smp_processor_id());
|
|
if (netif_xmit_frozen_or_stopped(txq) ||
|
|
!dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) {
|
|
skb_queue_head(&npinfo->txq, skb);
|
|
HARD_TX_UNLOCK(dev, txq);
|
|
local_irq_restore(flags);
|
|
|
|
schedule_delayed_work(&npinfo->tx_work, HZ/10);
|
|
return;
|
|
}
|
|
HARD_TX_UNLOCK(dev, txq);
|
|
local_irq_restore(flags);
|
|
}
|
|
}
|
|
|
|
static int netif_local_xmit_active(struct net_device *dev)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < dev->num_tx_queues; i++) {
|
|
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
|
|
|
|
if (netif_tx_owned(txq, smp_processor_id()))
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void poll_one_napi(struct napi_struct *napi)
|
|
{
|
|
int work;
|
|
|
|
/* If we set this bit but see that it has already been set,
|
|
* that indicates that napi has been disabled and we need
|
|
* to abort this operation
|
|
*/
|
|
if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
|
|
return;
|
|
|
|
/* We explicitly pass the polling call a budget of 0 to
|
|
* indicate that we are clearing the Tx path only.
|
|
*/
|
|
work = napi->poll(napi, 0);
|
|
WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll);
|
|
trace_napi_poll(napi, work, 0);
|
|
|
|
clear_bit(NAPI_STATE_NPSVC, &napi->state);
|
|
}
|
|
|
|
static void poll_napi(struct net_device *dev)
|
|
{
|
|
struct napi_struct *napi;
|
|
int cpu = smp_processor_id();
|
|
|
|
list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
|
|
if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
|
|
poll_one_napi(napi);
|
|
smp_store_release(&napi->poll_owner, -1);
|
|
}
|
|
}
|
|
}
|
|
|
|
void netpoll_poll_dev(struct net_device *dev)
|
|
{
|
|
struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
|
|
const struct net_device_ops *ops;
|
|
|
|
/* Don't do any rx activity if the dev_lock mutex is held
|
|
* the dev_open/close paths use this to block netpoll activity
|
|
* while changing device state
|
|
*/
|
|
if (!ni || down_trylock(&ni->dev_lock))
|
|
return;
|
|
|
|
/* Some drivers will take the same locks in poll and xmit,
|
|
* we can't poll if local CPU is already in xmit.
|
|
*/
|
|
if (!netif_running(dev) || netif_local_xmit_active(dev)) {
|
|
up(&ni->dev_lock);
|
|
return;
|
|
}
|
|
|
|
ops = dev->netdev_ops;
|
|
if (ops->ndo_poll_controller)
|
|
ops->ndo_poll_controller(dev);
|
|
|
|
poll_napi(dev);
|
|
|
|
up(&ni->dev_lock);
|
|
|
|
zap_completion_queue();
|
|
}
|
|
EXPORT_SYMBOL(netpoll_poll_dev);
|
|
|
|
void netpoll_poll_disable(struct net_device *dev)
|
|
{
|
|
struct netpoll_info *ni;
|
|
|
|
might_sleep();
|
|
ni = rtnl_dereference(dev->npinfo);
|
|
if (ni)
|
|
down(&ni->dev_lock);
|
|
}
|
|
|
|
void netpoll_poll_enable(struct net_device *dev)
|
|
{
|
|
struct netpoll_info *ni;
|
|
|
|
ni = rtnl_dereference(dev->npinfo);
|
|
if (ni)
|
|
up(&ni->dev_lock);
|
|
}
|
|
|
|
static void refill_skbs(struct netpoll *np)
|
|
{
|
|
struct sk_buff_head *skb_pool;
|
|
struct sk_buff *skb;
|
|
|
|
skb_pool = &np->skb_pool;
|
|
|
|
while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) {
|
|
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
|
|
if (!skb)
|
|
break;
|
|
|
|
skb_queue_tail(skb_pool, skb);
|
|
}
|
|
}
|
|
|
|
static void zap_completion_queue(void)
|
|
{
|
|
unsigned long flags;
|
|
struct softnet_data *sd = &get_cpu_var(softnet_data);
|
|
|
|
if (sd->completion_queue) {
|
|
struct sk_buff *clist;
|
|
|
|
local_irq_save(flags);
|
|
clist = sd->completion_queue;
|
|
sd->completion_queue = NULL;
|
|
local_irq_restore(flags);
|
|
|
|
while (clist != NULL) {
|
|
struct sk_buff *skb = clist;
|
|
clist = clist->next;
|
|
if (!skb_irq_freeable(skb)) {
|
|
refcount_set(&skb->users, 1);
|
|
dev_kfree_skb_any(skb); /* put this one back */
|
|
} else {
|
|
__kfree_skb(skb);
|
|
}
|
|
}
|
|
}
|
|
|
|
put_cpu_var(softnet_data);
|
|
}
|
|
|
|
static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
|
|
{
|
|
int count = 0;
|
|
struct sk_buff *skb;
|
|
|
|
zap_completion_queue();
|
|
repeat:
|
|
|
|
skb = alloc_skb(len, GFP_ATOMIC);
|
|
if (!skb) {
|
|
skb = skb_dequeue(&np->skb_pool);
|
|
schedule_work(&np->refill_wq);
|
|
}
|
|
|
|
if (!skb) {
|
|
if (++count < 10) {
|
|
netpoll_poll_dev(np->dev);
|
|
goto repeat;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
refcount_set(&skb->users, 1);
|
|
skb_reserve(skb, reserve);
|
|
return skb;
|
|
}
|
|
|
|
static int netpoll_owner_active(struct net_device *dev)
|
|
{
|
|
struct napi_struct *napi;
|
|
|
|
list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
|
|
if (READ_ONCE(napi->poll_owner) == smp_processor_id())
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* call with IRQ disabled */
|
|
static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
|
|
{
|
|
netdev_tx_t status = NETDEV_TX_BUSY;
|
|
netdev_tx_t ret = NET_XMIT_DROP;
|
|
struct net_device *dev;
|
|
unsigned long tries;
|
|
/* It is up to the caller to keep npinfo alive. */
|
|
struct netpoll_info *npinfo;
|
|
|
|
lockdep_assert_irqs_disabled();
|
|
|
|
dev = np->dev;
|
|
/* npinfo->txq belongs to np->dev, so retries must stay bound to it. */
|
|
skb->dev = dev;
|
|
rcu_read_lock();
|
|
npinfo = rcu_dereference_bh(dev->npinfo);
|
|
|
|
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
|
|
dev_kfree_skb_irq(skb);
|
|
goto out;
|
|
}
|
|
|
|
/* don't get messages out of order, and no recursion */
|
|
if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
|
|
struct netdev_queue *txq;
|
|
|
|
txq = netdev_core_pick_tx(dev, skb, NULL);
|
|
|
|
/* try until next clock tick */
|
|
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
|
|
tries > 0; --tries) {
|
|
if (HARD_TX_TRYLOCK(dev, txq)) {
|
|
if (!netif_xmit_stopped(txq))
|
|
status = netpoll_start_xmit(skb, dev, txq);
|
|
|
|
HARD_TX_UNLOCK(dev, txq);
|
|
|
|
if (dev_xmit_complete(status))
|
|
break;
|
|
|
|
}
|
|
|
|
/* tickle device maybe there is some cleanup */
|
|
netpoll_poll_dev(np->dev);
|
|
|
|
udelay(USEC_PER_POLL);
|
|
}
|
|
|
|
WARN_ONCE(!irqs_disabled(),
|
|
"netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n",
|
|
dev->name, dev->netdev_ops->ndo_start_xmit);
|
|
|
|
}
|
|
|
|
if (!dev_xmit_complete(status)) {
|
|
skb_queue_tail(&npinfo->txq, skb);
|
|
schedule_delayed_work(&npinfo->tx_work,0);
|
|
}
|
|
ret = NETDEV_TX_OK;
|
|
out:
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
|
|
static void netpoll_udp_checksum(struct netpoll *np, struct sk_buff *skb,
|
|
int len)
|
|
{
|
|
struct udphdr *udph;
|
|
int udp_len;
|
|
|
|
udp_len = len + sizeof(struct udphdr);
|
|
udph = udp_hdr(skb);
|
|
|
|
/* check needs to be set, since it will be consumed in csum_partial */
|
|
udph->check = 0;
|
|
if (np->ipv6)
|
|
udph->check = csum_ipv6_magic(&np->local_ip.in6,
|
|
&np->remote_ip.in6,
|
|
udp_len, IPPROTO_UDP,
|
|
csum_partial(udph, udp_len, 0));
|
|
else
|
|
udph->check = csum_tcpudp_magic(np->local_ip.ip,
|
|
np->remote_ip.ip,
|
|
udp_len, IPPROTO_UDP,
|
|
csum_partial(udph, udp_len, 0));
|
|
if (udph->check == 0)
|
|
udph->check = CSUM_MANGLED_0;
|
|
}
|
|
|
|
netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
|
|
{
|
|
unsigned long flags;
|
|
netdev_tx_t ret;
|
|
|
|
if (unlikely(!np)) {
|
|
dev_kfree_skb_irq(skb);
|
|
ret = NET_XMIT_DROP;
|
|
} else {
|
|
local_irq_save(flags);
|
|
ret = __netpoll_send_skb(np, skb);
|
|
local_irq_restore(flags);
|
|
}
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(netpoll_send_skb);
|
|
|
|
static void push_ipv6(struct netpoll *np, struct sk_buff *skb, int len)
|
|
{
|
|
struct ipv6hdr *ip6h;
|
|
|
|
skb_push(skb, sizeof(struct ipv6hdr));
|
|
skb_reset_network_header(skb);
|
|
ip6h = ipv6_hdr(skb);
|
|
|
|
/* ip6h->version = 6; ip6h->priority = 0; */
|
|
*(unsigned char *)ip6h = 0x60;
|
|
ip6h->flow_lbl[0] = 0;
|
|
ip6h->flow_lbl[1] = 0;
|
|
ip6h->flow_lbl[2] = 0;
|
|
|
|
ip6h->payload_len = htons(sizeof(struct udphdr) + len);
|
|
ip6h->nexthdr = IPPROTO_UDP;
|
|
ip6h->hop_limit = 32;
|
|
ip6h->saddr = np->local_ip.in6;
|
|
ip6h->daddr = np->remote_ip.in6;
|
|
|
|
skb->protocol = htons(ETH_P_IPV6);
|
|
}
|
|
|
|
static void push_ipv4(struct netpoll *np, struct sk_buff *skb, int len)
|
|
{
|
|
static atomic_t ip_ident;
|
|
struct iphdr *iph;
|
|
int ip_len;
|
|
|
|
ip_len = len + sizeof(struct udphdr) + sizeof(struct iphdr);
|
|
|
|
skb_push(skb, sizeof(struct iphdr));
|
|
skb_reset_network_header(skb);
|
|
iph = ip_hdr(skb);
|
|
|
|
/* iph->version = 4; iph->ihl = 5; */
|
|
*(unsigned char *)iph = 0x45;
|
|
iph->tos = 0;
|
|
put_unaligned(htons(ip_len), &iph->tot_len);
|
|
iph->id = htons(atomic_inc_return(&ip_ident));
|
|
iph->frag_off = 0;
|
|
iph->ttl = 64;
|
|
iph->protocol = IPPROTO_UDP;
|
|
iph->check = 0;
|
|
put_unaligned(np->local_ip.ip, &iph->saddr);
|
|
put_unaligned(np->remote_ip.ip, &iph->daddr);
|
|
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
|
|
skb->protocol = htons(ETH_P_IP);
|
|
}
|
|
|
|
static void push_udp(struct netpoll *np, struct sk_buff *skb, int len)
|
|
{
|
|
struct udphdr *udph;
|
|
int udp_len;
|
|
|
|
udp_len = len + sizeof(struct udphdr);
|
|
|
|
skb_push(skb, sizeof(struct udphdr));
|
|
skb_reset_transport_header(skb);
|
|
|
|
udph = udp_hdr(skb);
|
|
udph->source = htons(np->local_port);
|
|
udph->dest = htons(np->remote_port);
|
|
udph->len = htons(udp_len);
|
|
|
|
netpoll_udp_checksum(np, skb, len);
|
|
}
|
|
|
|
static void push_eth(struct netpoll *np, struct sk_buff *skb)
|
|
{
|
|
struct ethhdr *eth;
|
|
|
|
eth = skb_push(skb, ETH_HLEN);
|
|
skb_reset_mac_header(skb);
|
|
ether_addr_copy(eth->h_source, np->dev->dev_addr);
|
|
ether_addr_copy(eth->h_dest, np->remote_mac);
|
|
if (np->ipv6)
|
|
eth->h_proto = htons(ETH_P_IPV6);
|
|
else
|
|
eth->h_proto = htons(ETH_P_IP);
|
|
}
|
|
|
|
int netpoll_send_udp(struct netpoll *np, const char *msg, int len)
|
|
{
|
|
int total_len, ip_len, udp_len;
|
|
struct sk_buff *skb;
|
|
|
|
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
|
|
WARN_ON_ONCE(!irqs_disabled());
|
|
|
|
udp_len = len + sizeof(struct udphdr);
|
|
if (np->ipv6)
|
|
ip_len = udp_len + sizeof(struct ipv6hdr);
|
|
else
|
|
ip_len = udp_len + sizeof(struct iphdr);
|
|
|
|
total_len = ip_len + LL_RESERVED_SPACE(np->dev);
|
|
|
|
skb = find_skb(np, total_len + np->dev->needed_tailroom,
|
|
total_len - len);
|
|
if (!skb)
|
|
return -ENOMEM;
|
|
|
|
skb_copy_to_linear_data(skb, msg, len);
|
|
skb_put(skb, len);
|
|
|
|
push_udp(np, skb, len);
|
|
if (np->ipv6)
|
|
push_ipv6(np, skb, len);
|
|
else
|
|
push_ipv4(np, skb, len);
|
|
push_eth(np, skb);
|
|
skb->dev = np->dev;
|
|
|
|
return (int)netpoll_send_skb(np, skb);
|
|
}
|
|
EXPORT_SYMBOL(netpoll_send_udp);
|
|
|
|
|
|
static void skb_pool_flush(struct netpoll *np)
|
|
{
|
|
struct sk_buff_head *skb_pool;
|
|
|
|
cancel_work_sync(&np->refill_wq);
|
|
skb_pool = &np->skb_pool;
|
|
skb_queue_purge_reason(skb_pool, SKB_CONSUMED);
|
|
}
|
|
|
|
static void refill_skbs_work_handler(struct work_struct *work)
|
|
{
|
|
struct netpoll *np =
|
|
container_of(work, struct netpoll, refill_wq);
|
|
|
|
refill_skbs(np);
|
|
}
|
|
|
|
int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
|
|
{
|
|
struct netpoll_info *npinfo;
|
|
const struct net_device_ops *ops;
|
|
int err;
|
|
|
|
skb_queue_head_init(&np->skb_pool);
|
|
INIT_WORK(&np->refill_wq, refill_skbs_work_handler);
|
|
|
|
if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
|
|
np_err(np, "%s doesn't support polling, aborting\n",
|
|
ndev->name);
|
|
err = -ENOTSUPP;
|
|
goto out;
|
|
}
|
|
|
|
npinfo = rtnl_dereference(ndev->npinfo);
|
|
if (!npinfo) {
|
|
npinfo = kmalloc_obj(*npinfo);
|
|
if (!npinfo) {
|
|
err = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
sema_init(&npinfo->dev_lock, 1);
|
|
skb_queue_head_init(&npinfo->txq);
|
|
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
|
|
|
|
refcount_set(&npinfo->refcnt, 1);
|
|
|
|
ops = ndev->netdev_ops;
|
|
if (ops->ndo_netpoll_setup) {
|
|
err = ops->ndo_netpoll_setup(ndev);
|
|
if (err)
|
|
goto free_npinfo;
|
|
}
|
|
} else {
|
|
refcount_inc(&npinfo->refcnt);
|
|
}
|
|
|
|
np->dev = ndev;
|
|
strscpy(np->dev_name, ndev->name, IFNAMSIZ);
|
|
|
|
/* fill up the skb queue */
|
|
refill_skbs(np);
|
|
|
|
/* last thing to do is link it to the net device structure */
|
|
rcu_assign_pointer(ndev->npinfo, npinfo);
|
|
|
|
return 0;
|
|
|
|
free_npinfo:
|
|
kfree(npinfo);
|
|
out:
|
|
return err;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__netpoll_setup);
|
|
|
|
/*
|
|
* Returns a pointer to a string representation of the identifier used
|
|
* to select the egress interface for the given netpoll instance. buf
|
|
* is used to format np->dev_mac when np->dev_name is empty; bufsz must
|
|
* be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address
|
|
* and its NUL terminator.
|
|
*/
|
|
static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz)
|
|
{
|
|
if (np->dev_name[0])
|
|
return np->dev_name;
|
|
|
|
snprintf(buf, bufsz, "%pM", np->dev_mac);
|
|
return buf;
|
|
}
|
|
|
|
static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev,
|
|
unsigned int timeout)
|
|
{
|
|
unsigned long atmost;
|
|
|
|
atmost = jiffies + timeout * HZ;
|
|
while (!netif_carrier_ok(ndev)) {
|
|
if (time_after(jiffies, atmost)) {
|
|
np_notice(np, "timeout waiting for carrier\n");
|
|
break;
|
|
}
|
|
msleep(1);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Take the IPv6 from ndev and populate local_ip structure in netpoll
|
|
*/
|
|
static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
|
|
{
|
|
char buf[MAC_ADDR_STR_LEN + 1];
|
|
int err = -EDESTADDRREQ;
|
|
struct inet6_dev *idev;
|
|
|
|
if (!IS_ENABLED(CONFIG_IPV6)) {
|
|
np_err(np, "IPv6 is not supported %s, aborting\n",
|
|
egress_dev(np, buf, sizeof(buf)));
|
|
return -EINVAL;
|
|
}
|
|
|
|
idev = __in6_dev_get(ndev);
|
|
if (idev) {
|
|
struct inet6_ifaddr *ifp;
|
|
|
|
read_lock_bh(&idev->lock);
|
|
list_for_each_entry(ifp, &idev->addr_list, if_list) {
|
|
if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) !=
|
|
!!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL))
|
|
continue;
|
|
/* Got the IP, let's return */
|
|
np->local_ip.in6 = ifp->addr;
|
|
err = 0;
|
|
break;
|
|
}
|
|
read_unlock_bh(&idev->lock);
|
|
}
|
|
if (err) {
|
|
np_err(np, "no IPv6 address for %s, aborting\n",
|
|
egress_dev(np, buf, sizeof(buf)));
|
|
return err;
|
|
}
|
|
|
|
np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Take the IPv4 from ndev and populate local_ip structure in netpoll
|
|
*/
|
|
static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev)
|
|
{
|
|
char buf[MAC_ADDR_STR_LEN + 1];
|
|
const struct in_ifaddr *ifa;
|
|
struct in_device *in_dev;
|
|
|
|
in_dev = __in_dev_get_rtnl(ndev);
|
|
if (!in_dev) {
|
|
np_err(np, "no IP address for %s, aborting\n",
|
|
egress_dev(np, buf, sizeof(buf)));
|
|
return -EDESTADDRREQ;
|
|
}
|
|
|
|
ifa = rtnl_dereference(in_dev->ifa_list);
|
|
if (!ifa) {
|
|
np_err(np, "no IP address for %s, aborting\n",
|
|
egress_dev(np, buf, sizeof(buf)));
|
|
return -EDESTADDRREQ;
|
|
}
|
|
|
|
np->local_ip.ip = ifa->ifa_local;
|
|
np_info(np, "local IP %pI4\n", &np->local_ip.ip);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Test whether the caller left np->local_ip unset, so that
|
|
* netpoll_setup() should auto-populate it from the egress device.
|
|
*
|
|
* np->local_ip is a union of __be32 (IPv4) and struct in6_addr (IPv6),
|
|
* so an IPv6 address whose first 4 bytes are zero (e.g. ::1, ::2,
|
|
* IPv4-mapped ::ffff:a.b.c.d) must not be tested via the IPv4 arm —
|
|
* doing so would misclassify a caller-supplied address as unset and
|
|
* silently overwrite it with whatever address the device exposes.
|
|
*/
|
|
static bool netpoll_local_ip_unset(const struct netpoll *np)
|
|
{
|
|
if (np->ipv6)
|
|
return ipv6_addr_any(&np->local_ip.in6);
|
|
return !np->local_ip.ip;
|
|
}
|
|
|
|
int netpoll_setup(struct netpoll *np)
|
|
{
|
|
struct net *net = current->nsproxy->net_ns;
|
|
char buf[MAC_ADDR_STR_LEN + 1];
|
|
struct net_device *ndev = NULL;
|
|
bool ip_overwritten = false;
|
|
int err;
|
|
|
|
rtnl_lock();
|
|
if (np->dev_name[0])
|
|
ndev = __dev_get_by_name(net, np->dev_name);
|
|
else if (is_valid_ether_addr(np->dev_mac))
|
|
ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
|
|
|
|
if (!ndev) {
|
|
np_err(np, "%s doesn't exist, aborting\n",
|
|
egress_dev(np, buf, sizeof(buf)));
|
|
err = -ENODEV;
|
|
goto unlock;
|
|
}
|
|
netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);
|
|
|
|
if (netdev_master_upper_dev_get(ndev)) {
|
|
np_err(np, "%s is a slave device, aborting\n",
|
|
egress_dev(np, buf, sizeof(buf)));
|
|
err = -EBUSY;
|
|
goto put;
|
|
}
|
|
|
|
if (!netif_running(ndev)) {
|
|
np_info(np, "device %s not up yet, forcing it\n",
|
|
egress_dev(np, buf, sizeof(buf)));
|
|
|
|
err = dev_open(ndev, NULL);
|
|
if (err) {
|
|
np_err(np, "failed to open %s\n", ndev->name);
|
|
goto put;
|
|
}
|
|
|
|
rtnl_unlock();
|
|
netpoll_wait_carrier(np, ndev, carrier_timeout);
|
|
rtnl_lock();
|
|
}
|
|
|
|
if (netpoll_local_ip_unset(np)) {
|
|
if (!np->ipv6) {
|
|
err = netpoll_take_ipv4(np, ndev);
|
|
if (err)
|
|
goto put;
|
|
} else {
|
|
err = netpoll_take_ipv6(np, ndev);
|
|
if (err)
|
|
goto put;
|
|
}
|
|
ip_overwritten = true;
|
|
}
|
|
|
|
err = __netpoll_setup(np, ndev);
|
|
if (err)
|
|
goto flush;
|
|
rtnl_unlock();
|
|
|
|
/* Make sure all NAPI polls which started before dev->npinfo
|
|
* was visible have exited before we start calling NAPI poll.
|
|
* NAPI skips locking if dev->npinfo is NULL.
|
|
*/
|
|
synchronize_rcu();
|
|
|
|
return 0;
|
|
|
|
flush:
|
|
skb_pool_flush(np);
|
|
put:
|
|
DEBUG_NET_WARN_ON_ONCE(np->dev);
|
|
if (ip_overwritten)
|
|
memset(&np->local_ip, 0, sizeof(np->local_ip));
|
|
netdev_put(ndev, &np->dev_tracker);
|
|
unlock:
|
|
rtnl_unlock();
|
|
return err;
|
|
}
|
|
EXPORT_SYMBOL(netpoll_setup);
|
|
|
|
static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
|
|
{
|
|
struct netpoll_info *npinfo =
|
|
container_of(rcu_head, struct netpoll_info, rcu);
|
|
|
|
skb_queue_purge(&npinfo->txq);
|
|
|
|
/* we can't call cancel_delayed_work_sync here, as we are in softirq */
|
|
cancel_delayed_work(&npinfo->tx_work);
|
|
|
|
/* clean after last, unfinished work */
|
|
__skb_queue_purge(&npinfo->txq);
|
|
/* now cancel it again */
|
|
cancel_delayed_work(&npinfo->tx_work);
|
|
kfree(npinfo);
|
|
}
|
|
|
|
static void __netpoll_cleanup(struct netpoll *np)
|
|
{
|
|
struct netpoll_info *npinfo;
|
|
|
|
npinfo = rtnl_dereference(np->dev->npinfo);
|
|
if (!npinfo)
|
|
return;
|
|
|
|
/* At this point, there is a single npinfo instance per netdevice, and
|
|
* its refcnt tracks how many netpoll structures are linked to it. We
|
|
* only perform npinfo cleanup when the refcnt decrements to zero.
|
|
*/
|
|
if (refcount_dec_and_test(&npinfo->refcnt)) {
|
|
const struct net_device_ops *ops;
|
|
|
|
ops = np->dev->netdev_ops;
|
|
if (ops->ndo_netpoll_cleanup)
|
|
ops->ndo_netpoll_cleanup(np->dev);
|
|
|
|
RCU_INIT_POINTER(np->dev->npinfo, NULL);
|
|
call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
|
|
}
|
|
|
|
skb_pool_flush(np);
|
|
}
|
|
|
|
/*
 * Clean up @np and free it.  The caller must hold the RTNL lock.
 *
 * synchronize_net() runs before the cleanup so that transmissions still in
 * flight can finish.
 */
void __netpoll_free(struct netpoll *np)
{
	ASSERT_RTNL();

	/* Let in-flight transmissions drain before tearing anything down. */
	synchronize_net();

	__netpoll_cleanup(np);
	kfree(np);
}
EXPORT_SYMBOL_GPL(__netpoll_free);
|
|
|
|
void do_netpoll_cleanup(struct netpoll *np)
|
|
{
|
|
__netpoll_cleanup(np);
|
|
netdev_put(np->dev, &np->dev_tracker);
|
|
np->dev = NULL;
|
|
}
|
|
EXPORT_SYMBOL(do_netpoll_cleanup);
|
|
|
|
void netpoll_cleanup(struct netpoll *np)
|
|
{
|
|
rtnl_lock();
|
|
if (!np->dev)
|
|
goto out;
|
|
do_netpoll_cleanup(np);
|
|
out:
|
|
rtnl_unlock();
|
|
}
|
|
EXPORT_SYMBOL(netpoll_cleanup);
|