mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-16 13:41:48 -04:00
Merge tag 'nf-next-26-03-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Florian Westphal says: ==================== netfilter: updates for net-next The following patchset contains Netfilter updates for *net-next*, including changes to IPv6 stack and updates to IPVS from Julian Anastasov. 1) ipv6: export fib6_lookup for nft_fib_ipv6 module 2) factor out ipv6_anycast_destination logic so it's usable without dst_entry. These are dependencies for patch 3. 3) switch nft_fib_ipv6 module to no longer need temporary dst_entry object allocations by using fib6_lookup() + RCU. This gets us ~13% higher packet rate in my tests. Patches 4 to 8, from Eric Dumazet, zap sk_callback_lock usage in netfilter. Patch 9 removes another sk_callback_lock instance. Remaining patches, from Julian Anastasov, improve IPVS. Quoting Julian: * Add infrastructure for resizable hash tables based on hlist_bl. * Change the 256-bucket service hash table to be resizable. * Change the global connection table to be per-net and resizable. * Make connection hashing more secure for setups with multiple services. 
netfilter pull request nf-next-26-03-04 * tag 'nf-next-26-03-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next: ipvs: use more keys for connection hashing ipvs: switch to per-net connection table ipvs: use resizable hash table for services ipvs: add resizable hash tables rculist_bl: add hlist_bl_for_each_entry_continue_rcu netfilter: nfnetlink_queue: remove locking in nfqnl_get_sk_secctx netfilter: nfnetlink_queue: no longer acquire sk_callback_lock netfilter: nfnetlink_log: no longer acquire sk_callback_lock netfilter: nft_meta: no longer acquire sk_callback_lock in nft_meta_get_eval_skugid() netfilter: xt_owner: no longer acquire sk_callback_lock in mt_owner() netfilter: nf_log_syslog: no longer acquire sk_callback_lock in nf_log_dump_sk_uid_gid() netfilter: nft_fib_ipv6: switch to fib6_lookup ipv6: make ipv6_anycast_destination logic usable without dst_entry ipv6: export fib6_lookup for nft_fib_ipv6 ==================== Link: https://patch.msgid.link/20260304114921.31042-1-fw@strlen.de Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
@@ -8,21 +8,31 @@
|
||||
#include <linux/list_bl.h>
|
||||
#include <linux/rcupdate.h>
|
||||
|
||||
/* return the first ptr or next element in an RCU protected list */
|
||||
#define hlist_bl_first_rcu(head) \
|
||||
(*((struct hlist_bl_node __rcu **)(&(head)->first)))
|
||||
#define hlist_bl_next_rcu(node) \
|
||||
(*((struct hlist_bl_node __rcu **)(&(node)->next)))
|
||||
|
||||
static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
|
||||
struct hlist_bl_node *n)
|
||||
{
|
||||
LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
|
||||
LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
|
||||
LIST_BL_LOCKMASK);
|
||||
rcu_assign_pointer(h->first,
|
||||
rcu_assign_pointer(hlist_bl_first_rcu(h),
|
||||
(struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK));
|
||||
}
|
||||
|
||||
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
|
||||
{
|
||||
return (struct hlist_bl_node *)
|
||||
((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
|
||||
}
|
||||
/*
 * hlist_bl_first_rcu_dereference - fetch the first node of a bit-locked list
 * @head: the list head; evaluated exactly once (copied into __head).
 *
 * Loads the first pointer with rcu_dereference_check(), passing
 * hlist_bl_is_locked() on the head as the check condition, and yields the
 * pointer with the LIST_BL_LOCKMASK bits cleared.
 */
#define hlist_bl_first_rcu_dereference(head) \
|
||||
({ \
|
||||
struct hlist_bl_head *__head = (head); \
|
||||
\
|
||||
(struct hlist_bl_node *) \
|
||||
((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \
|
||||
hlist_bl_is_locked(__head)) & \
|
||||
~LIST_BL_LOCKMASK); \
|
||||
})
|
||||
|
||||
/**
|
||||
* hlist_bl_del_rcu - deletes entry from hash list without re-initialization
|
||||
@@ -73,7 +83,7 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
|
||||
{
|
||||
struct hlist_bl_node *first;
|
||||
|
||||
/* don't need hlist_bl_first_rcu because we're under lock */
|
||||
/* don't need hlist_bl_first_rcu* because we're under lock */
|
||||
first = hlist_bl_first(h);
|
||||
|
||||
n->next = first;
|
||||
@@ -93,9 +103,30 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
|
||||
*
|
||||
*/
|
||||
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \
|
||||
for (pos = hlist_bl_first_rcu(head); \
|
||||
for (pos = hlist_bl_first_rcu_dereference(head); \
|
||||
pos && \
|
||||
({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
|
||||
pos = rcu_dereference_raw(pos->next))
|
||||
pos = rcu_dereference_raw(hlist_bl_next_rcu(pos)))
|
||||
|
||||
/**
|
||||
* hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given
|
||||
* type
|
||||
* @tpos: the type * to use as a loop cursor.
|
||||
* @pos: the &struct hlist_bl_node to use as a loop cursor.
|
||||
* @member: the name of the hlist_bl_node within the struct.
|
||||
*
|
||||
* Continue to iterate over list of given type, continuing after
|
||||
* the current position which must have been in the list when the RCU read
|
||||
* lock was taken.
|
||||
* This would typically require either that you obtained the node from a
|
||||
* previous walk of the list in the same RCU read-side critical section, or
|
||||
* that you held some sort of non-RCU reference (such as a reference count)
|
||||
* to keep the node alive *and* in the list.
|
||||
*/
|
||||
#define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \
|
||||
for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \
|
||||
pos && \
|
||||
({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
|
||||
pos = rcu_dereference_raw(hlist_bl_next_rcu(pos)))
|
||||
|
||||
#endif
|
||||
|
||||
@@ -252,15 +252,22 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
|
||||
return rt->rt6i_flags & RTF_LOCAL;
|
||||
}
|
||||
|
||||
/*
 * Anycast test on raw route fields, usable without a dst_entry.
 *
 * Returns true if RTF_ANYCAST is set in @rt6i_flags, or if all of the
 * following hold: the prefix length in @rt6i_dst is below 127, neither
 * RTF_GATEWAY nor RTF_NONEXTHOP is set in @rt6i_flags, and @daddr equals
 * the prefix address stored in @rt6i_dst.
 */
static inline bool __ipv6_anycast_destination(const struct rt6key *rt6i_dst,
|
||||
u32 rt6i_flags,
|
||||
const struct in6_addr *daddr)
|
||||
{
|
||||
return rt6i_flags & RTF_ANYCAST ||
|
||||
(rt6i_dst->plen < 127 &&
|
||||
!(rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) &&
|
||||
ipv6_addr_equal(&rt6i_dst->addr, daddr));
|
||||
}
|
||||
|
||||
static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
|
||||
const struct in6_addr *daddr)
|
||||
{
|
||||
const struct rt6_info *rt = dst_rt6_info(dst);
|
||||
|
||||
return rt->rt6i_flags & RTF_ANYCAST ||
|
||||
(rt->rt6i_dst.plen < 127 &&
|
||||
!(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) &&
|
||||
ipv6_addr_equal(&rt->rt6i_dst.addr, daddr));
|
||||
return __ipv6_anycast_destination(&rt->rt6i_dst, rt->rt6i_flags, daddr);
|
||||
}
|
||||
|
||||
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <asm/types.h> /* for __uXX types */
|
||||
|
||||
#include <linux/list.h> /* for struct list_head */
|
||||
#include <linux/rculist_bl.h> /* for struct hlist_bl_head */
|
||||
#include <linux/spinlock.h> /* for struct rwlock_t */
|
||||
#include <linux/atomic.h> /* for struct atomic_t */
|
||||
#include <linux/refcount.h> /* for struct refcount_t */
|
||||
@@ -30,15 +31,22 @@
|
||||
#endif
|
||||
#include <net/net_namespace.h> /* Netw namespace */
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <linux/siphash.h>
|
||||
|
||||
#define IP_VS_HDR_INVERSE 1
|
||||
#define IP_VS_HDR_ICMP 2
|
||||
/*
|
||||
* Hash table: for virtual service lookups
|
||||
*/
|
||||
#define IP_VS_SVC_TAB_BITS 8
|
||||
#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS)
|
||||
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
|
||||
|
||||
/* conn_tab limits (as per Kconfig) */
|
||||
#define IP_VS_CONN_TAB_MIN_BITS 8
|
||||
#if BITS_PER_LONG > 32
|
||||
#define IP_VS_CONN_TAB_MAX_BITS 27
|
||||
#else
|
||||
#define IP_VS_CONN_TAB_MAX_BITS 20
|
||||
#endif
|
||||
|
||||
/* svc_table limits */
|
||||
#define IP_VS_SVC_TAB_MIN_BITS 4
|
||||
#define IP_VS_SVC_TAB_MAX_BITS 20
|
||||
|
||||
/* Generic access of ipvs struct */
|
||||
static inline struct netns_ipvs *net_ipvs(struct net* net)
|
||||
@@ -49,8 +57,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net)
|
||||
/* Connections' size value needed by ip_vs_ctl.c */
|
||||
extern int ip_vs_conn_tab_size;
|
||||
|
||||
extern struct mutex __ip_vs_mutex;
|
||||
|
||||
struct ip_vs_iphdr {
|
||||
int hdr_flags; /* ipvs flags */
|
||||
__u32 off; /* Where IP or IPv4 header starts */
|
||||
@@ -271,6 +277,10 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
|
||||
pr_err(msg, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
/* Spinlock wrapper carrying the ____cacheline_aligned_in_smp attribute;
 * struct ip_vs_rht points to these through its ->lock member.
 */
struct ip_vs_aligned_lock {
|
||||
spinlock_t l; /* Protect buckets */
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
/* For arrays per family */
|
||||
enum {
|
||||
IP_VS_AF_INET,
|
||||
@@ -283,6 +293,13 @@ static inline int ip_vs_af_index(int af)
|
||||
return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET;
|
||||
}
|
||||
|
||||
/* work_flags */
|
||||
enum {
|
||||
IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */
|
||||
IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */
|
||||
IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */
|
||||
};
|
||||
|
||||
/* The port number of FTP service (in network order). */
|
||||
#define FTPPORT cpu_to_be16(21)
|
||||
#define FTPDATA cpu_to_be16(20)
|
||||
@@ -484,6 +501,198 @@ struct ip_vs_est_kt_data {
|
||||
int est_row; /* estimated row */
|
||||
};
|
||||
|
||||
/* IPVS resizable hash tables */
|
||||
struct ip_vs_rht {
|
||||
struct hlist_bl_head *buckets;
|
||||
struct ip_vs_rht __rcu *new_tbl; /* New/Same table */
|
||||
seqcount_t *seqc; /* Protects moves */
|
||||
struct ip_vs_aligned_lock *lock; /* Protect seqc */
|
||||
int mask; /* Buckets mask */
|
||||
int size; /* Buckets */
|
||||
int seqc_mask; /* seqc mask */
|
||||
int lock_mask; /* lock mask */
|
||||
u32 table_id;
|
||||
int u_thresh; /* upper threshold */
|
||||
int l_thresh; /* lower threshold */
|
||||
int lfactor; /* Load Factor (shift)*/
|
||||
int bits; /* size = 1 << bits */
|
||||
siphash_key_t hash_key;
|
||||
struct rcu_head rcu_head;
|
||||
};
|
||||
|
||||
/**
|
||||
* ip_vs_rht_for_each_table() - Walk the hash tables
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @t: current table, used as cursor, struct ip_vs_rht *var
|
||||
* @p: previous table, temp struct ip_vs_rht *var
|
||||
*
|
||||
* Walk tables assuming others can not change the installed tables
|
||||
*/
|
||||
#define ip_vs_rht_for_each_table(table, t, p) \
|
||||
for (p = NULL, t = rcu_dereference_protected(table, 1); \
|
||||
t != p; \
|
||||
p = t, t = rcu_dereference_protected(t->new_tbl, 1))
|
||||
|
||||
/**
|
||||
* ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @t: current table, used as cursor, struct ip_vs_rht *var
|
||||
* @p: previous table, temp struct ip_vs_rht *var
|
||||
*
|
||||
* We usually search in one table and also in second table on resizing
|
||||
*/
|
||||
#define ip_vs_rht_for_each_table_rcu(table, t, p) \
|
||||
for (p = NULL, t = rcu_dereference(table); \
|
||||
t != p; \
|
||||
p = t, t = rcu_dereference(t->new_tbl))
|
||||
|
||||
/**
 * ip_vs_rht_for_each_bucket() - Walk all table buckets
 * @t:		current table, struct ip_vs_rht * expression
 * @bucket:	bucket index, used as cursor, u32 var
 * @head:	bucket address, used as cursor, struct hlist_bl_head *var
 *
 * @t is evaluated on every iteration, so it must be free of side effects.
 * It is parenthesized at each use so that any pointer expression (not only
 * a plain identifier) can be passed safely.
 */
#define ip_vs_rht_for_each_bucket(t, bucket, head)			\
	for (bucket = 0, head = (t)->buckets;				\
	     bucket < (t)->size; bucket++, head++)
|
||||
|
||||
/**
|
||||
* ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved
|
||||
* @t: current table, used as cursor, struct ip_vs_rht *var
|
||||
* @bucket: index of current bucket or hash key
|
||||
* @sc: temp seqcount_t *var
|
||||
* @seq: temp unsigned int var for sequence count
|
||||
* @retry: temp int var
|
||||
*/
|
||||
#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \
|
||||
for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \
|
||||
retry && ({ seq = read_seqcount_begin(sc); 1; }); \
|
||||
retry = read_seqcount_retry(sc, seq))
|
||||
|
||||
/**
|
||||
* DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables
|
||||
*
|
||||
* Variables for ip_vs_rht_walk_buckets_rcu
|
||||
*/
|
||||
#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \
|
||||
struct ip_vs_rht *_t, *_p; \
|
||||
unsigned int _seq; \
|
||||
seqcount_t *_sc; \
|
||||
u32 _bucket; \
|
||||
int _retry
|
||||
/**
|
||||
* ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*
|
||||
* Can be used while others add/delete/move entries
|
||||
* Not suitable if duplicates are not desired
|
||||
* Possible cases for reader that uses cond_resched_rcu() in the loop:
|
||||
* - new table can not be installed, no need to repeat
|
||||
* - new table can be installed => check and repeat if new table is
|
||||
* installed, needed for !PREEMPT_RCU
|
||||
*/
|
||||
#define ip_vs_rht_walk_buckets_rcu(table, head) \
|
||||
ip_vs_rht_for_each_table_rcu(table, _t, _p) \
|
||||
ip_vs_rht_for_each_bucket(_t, _bucket, head) \
|
||||
ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \
|
||||
_seq, _retry)
|
||||
|
||||
/**
|
||||
* DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables
|
||||
*
|
||||
* Variables for ip_vs_rht_walk_bucket_rcu
|
||||
*/
|
||||
#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \
|
||||
unsigned int _seq; \
|
||||
seqcount_t *_sc; \
|
||||
int _retry
|
||||
/**
|
||||
* ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock
|
||||
* @t: current table, struct ip_vs_rht *var
|
||||
* @bucket: index of current bucket or hash key
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*
|
||||
* Can be used while others add/delete/move entries
|
||||
* Not suitable if duplicates are not desired
|
||||
* Possible cases for reader that uses cond_resched_rcu() in the loop:
|
||||
* - new table can not be installed, no need to repeat
|
||||
* - new table can be installed => check and repeat if new table is
|
||||
* installed, needed for !PREEMPT_RCU
|
||||
*/
|
||||
#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \
|
||||
if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \
|
||||
{} \
|
||||
else \
|
||||
ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry)
|
||||
|
||||
/**
|
||||
* DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables
|
||||
*
|
||||
* Variables for ip_vs_rht_walk_buckets_safe_rcu
|
||||
*/
|
||||
#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \
|
||||
struct ip_vs_rht *_t, *_p; \
|
||||
u32 _bucket
|
||||
/**
|
||||
* ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*
|
||||
* Can be used while others add/delete entries but moving is disabled
|
||||
* Using cond_resched_rcu() should be safe if tables do not change
|
||||
*/
|
||||
#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \
|
||||
ip_vs_rht_for_each_table_rcu(table, _t, _p) \
|
||||
ip_vs_rht_for_each_bucket(_t, _bucket, head)
|
||||
|
||||
/**
|
||||
* DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables
|
||||
*
|
||||
* Variables for ip_vs_rht_walk_buckets
|
||||
*/
|
||||
#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \
|
||||
struct ip_vs_rht *_t, *_p; \
|
||||
u32 _bucket
|
||||
|
||||
/**
|
||||
* ip_vs_rht_walk_buckets() - Walk all buckets
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*
|
||||
* Use if others can not add/delete/move entries
|
||||
*/
|
||||
#define ip_vs_rht_walk_buckets(table, head) \
|
||||
ip_vs_rht_for_each_table(table, _t, _p) \
|
||||
ip_vs_rht_for_each_bucket(_t, _bucket, head)
|
||||
|
||||
/* Entries can be in one of two tables, so we flip bit when new table is
|
||||
* created and store it as highest bit in hash keys
|
||||
*/
|
||||
#define IP_VS_RHT_TABLE_ID_MASK BIT(31)
|
||||
|
||||
/* Check if hash key is from this table */
|
||||
static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key)
|
||||
{
|
||||
return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK);
|
||||
}
|
||||
|
||||
/* Build per-table hash key from hash value */
|
||||
static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash)
|
||||
{
|
||||
return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK);
|
||||
}
|
||||
|
||||
void ip_vs_rht_free(struct ip_vs_rht *t);
|
||||
void ip_vs_rht_rcu_free(struct rcu_head *head);
|
||||
struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks);
|
||||
int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
|
||||
int lfactor, int min_bits, int max_bits);
|
||||
void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
|
||||
int min_bits, int max_bits);
|
||||
u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
|
||||
const union nf_inet_addr *addr, u32 v1, u32 v2);
|
||||
|
||||
struct dst_entry;
|
||||
struct iphdr;
|
||||
struct ip_vs_conn;
|
||||
@@ -577,42 +786,72 @@ struct ip_vs_conn_param {
|
||||
__u8 pe_data_len;
|
||||
};
|
||||
|
||||
/* Hash node in conn_tab */
|
||||
struct ip_vs_conn_hnode {
|
||||
struct hlist_bl_node node; /* node in conn_tab */
|
||||
u32 hash_key; /* Key for the hash table */
|
||||
u8 dir; /* 0=out->in, 1=in->out */
|
||||
} __packed;
|
||||
|
||||
/* IP_VS structure allocated for each dynamically scheduled connection */
|
||||
struct ip_vs_conn {
|
||||
struct hlist_node c_list; /* hashed list heads */
|
||||
/* Protocol, addresses and port numbers */
|
||||
/* Cacheline for hash table nodes - rarely modified */
|
||||
|
||||
struct ip_vs_conn_hnode hn0; /* Original direction */
|
||||
u8 af; /* address family */
|
||||
__be16 cport;
|
||||
struct ip_vs_conn_hnode hn1; /* Reply direction */
|
||||
u8 daf; /* Address family of the dest */
|
||||
__be16 dport;
|
||||
__be16 vport;
|
||||
u16 af; /* address family */
|
||||
union nf_inet_addr caddr; /* client address */
|
||||
union nf_inet_addr vaddr; /* virtual address */
|
||||
union nf_inet_addr daddr; /* destination address */
|
||||
struct ip_vs_dest *dest; /* real server */
|
||||
atomic_t n_control; /* Number of controlled ones */
|
||||
volatile __u32 flags; /* status flags */
|
||||
__u16 protocol; /* Which protocol (TCP/UDP) */
|
||||
__u16 daf; /* Address family of the dest */
|
||||
struct netns_ipvs *ipvs;
|
||||
/* 44/64 */
|
||||
|
||||
/* counter and timer */
|
||||
refcount_t refcnt; /* reference count */
|
||||
struct timer_list timer; /* Expiration timer */
|
||||
volatile unsigned long timeout; /* timeout */
|
||||
|
||||
/* Flags and state transition */
|
||||
spinlock_t lock; /* lock for state transition */
|
||||
struct ip_vs_conn *control; /* Master control connection */
|
||||
const struct ip_vs_pe *pe;
|
||||
char *pe_data;
|
||||
__u8 pe_data_len;
|
||||
volatile __u16 state; /* state info */
|
||||
volatile __u16 old_state; /* old state, to be used for
|
||||
* state transition triggered
|
||||
* synchronization
|
||||
*/
|
||||
__u32 fwmark; /* Fire wall mark from skb */
|
||||
unsigned long sync_endtime; /* jiffies + sent_retries */
|
||||
/* 2-byte hole */
|
||||
/* 64/96 */
|
||||
|
||||
/* Control members */
|
||||
struct ip_vs_conn *control; /* Master control connection */
|
||||
atomic_t n_control; /* Number of controlled ones */
|
||||
struct ip_vs_dest *dest; /* real server */
|
||||
union nf_inet_addr caddr; /* client address */
|
||||
union nf_inet_addr vaddr; /* virtual address */
|
||||
/* 96/128 */
|
||||
|
||||
union nf_inet_addr daddr; /* destination address */
|
||||
__u32 fwmark; /* Fire wall mark from skb */
|
||||
__be16 vport;
|
||||
__u16 protocol; /* Which protocol (TCP/UDP) */
|
||||
|
||||
/* Note: we can group the following members into a structure,
|
||||
* in order to save more space, and the following members are
|
||||
* only used in VS/NAT anyway
|
||||
*/
|
||||
struct ip_vs_app *app; /* bound ip_vs_app object */
|
||||
void *app_data; /* Application private data */
|
||||
/* 128/168 */
|
||||
struct_group(sync_conn_opt,
|
||||
struct ip_vs_seq in_seq; /* incoming seq. struct */
|
||||
struct ip_vs_seq out_seq; /* outgoing seq. struct */
|
||||
);
|
||||
/* 152/192 */
|
||||
|
||||
struct timer_list timer; /* Expiration timer */
|
||||
volatile unsigned long timeout; /* timeout */
|
||||
spinlock_t lock; /* lock for state transition */
|
||||
refcount_t refcnt; /* reference count */
|
||||
atomic_t in_pkts; /* incoming packet counter */
|
||||
/* 64-bit: 4-byte gap */
|
||||
|
||||
/* 188/256 */
|
||||
unsigned long sync_endtime; /* jiffies + sent_retries */
|
||||
struct netns_ipvs *ipvs;
|
||||
|
||||
/* Packet transmitter for different forwarding methods. If it
|
||||
* mangles the packet, it must return NF_DROP or better NF_STOLEN,
|
||||
@@ -622,21 +861,6 @@ struct ip_vs_conn {
|
||||
int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
|
||||
struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
|
||||
|
||||
/* Note: we can group the following members into a structure,
|
||||
* in order to save more space, and the following members are
|
||||
* only used in VS/NAT anyway
|
||||
*/
|
||||
struct ip_vs_app *app; /* bound ip_vs_app object */
|
||||
void *app_data; /* Application private data */
|
||||
struct_group(sync_conn_opt,
|
||||
struct ip_vs_seq in_seq; /* incoming seq. struct */
|
||||
struct ip_vs_seq out_seq; /* outgoing seq. struct */
|
||||
);
|
||||
|
||||
const struct ip_vs_pe *pe;
|
||||
char *pe_data;
|
||||
__u8 pe_data_len;
|
||||
|
||||
struct rcu_head rcu_head;
|
||||
};
|
||||
|
||||
@@ -691,14 +915,15 @@ struct ip_vs_dest_user_kern {
|
||||
* forwarding entries.
|
||||
*/
|
||||
struct ip_vs_service {
|
||||
struct hlist_node s_list; /* node in service table */
|
||||
atomic_t refcnt; /* reference counter */
|
||||
|
||||
struct hlist_bl_node s_list; /* node in service table */
|
||||
u32 hash_key; /* Key for the hash table */
|
||||
u16 af; /* address family */
|
||||
__u16 protocol; /* which protocol (TCP/UDP) */
|
||||
|
||||
union nf_inet_addr addr; /* IP address for virtual service */
|
||||
__be16 port; /* port number for the service */
|
||||
__u32 fwmark; /* firewall mark of the service */
|
||||
atomic_t refcnt; /* reference counter */
|
||||
__be16 port; /* port number for the service */
|
||||
unsigned int flags; /* service status flags */
|
||||
unsigned int timeout; /* persistent timeout in ticks */
|
||||
__be32 netmask; /* grouping granularity, mask/plen */
|
||||
@@ -808,8 +1033,8 @@ struct ip_vs_pe {
|
||||
int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb);
|
||||
bool (*ct_match)(const struct ip_vs_conn_param *p,
|
||||
struct ip_vs_conn *ct);
|
||||
u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
|
||||
bool inverse);
|
||||
u32 (*hashkey_raw)(const struct ip_vs_conn_param *p,
|
||||
struct ip_vs_rht *t, bool inverse);
|
||||
int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
|
||||
/* create connections for real-server outgoing packets */
|
||||
struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc,
|
||||
@@ -949,6 +1174,7 @@ struct netns_ipvs {
|
||||
/* ip_vs_conn */
|
||||
atomic_t conn_count; /* connection counter */
|
||||
atomic_t no_cport_conns[IP_VS_AF_MAX];
|
||||
struct delayed_work conn_resize_work;/* resize conn_tab */
|
||||
|
||||
/* ip_vs_ctl */
|
||||
struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */
|
||||
@@ -957,6 +1183,10 @@ struct netns_ipvs {
|
||||
struct list_head dest_trash;
|
||||
spinlock_t dest_trash_lock;
|
||||
struct timer_list dest_trash_timer; /* expiration timer */
|
||||
struct mutex service_mutex; /* service reconfig */
|
||||
struct rw_semaphore svc_resize_sem; /* svc_table resizing */
|
||||
struct delayed_work svc_resize_work; /* resize svc_table */
|
||||
atomic_t svc_table_changes;/* ++ on new table */
|
||||
/* Service counters */
|
||||
atomic_t num_services[IP_VS_AF_MAX]; /* Services */
|
||||
atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */
|
||||
@@ -1021,6 +1251,8 @@ struct netns_ipvs {
|
||||
int sysctl_est_nice; /* kthread nice */
|
||||
int est_stopped; /* stop tasks */
|
||||
#endif
|
||||
int sysctl_conn_lfactor;
|
||||
int sysctl_svc_lfactor;
|
||||
|
||||
/* ip_vs_lblc */
|
||||
int sysctl_lblc_expiration;
|
||||
@@ -1030,6 +1262,7 @@ struct netns_ipvs {
|
||||
int sysctl_lblcr_expiration;
|
||||
struct ctl_table_header *lblcr_ctl_header;
|
||||
struct ctl_table *lblcr_ctl_table;
|
||||
unsigned long work_flags; /* IP_VS_WORK_* flags */
|
||||
/* ip_vs_est */
|
||||
struct delayed_work est_reload_work;/* Reload kthread tasks */
|
||||
struct mutex est_mutex; /* protect kthread tasks */
|
||||
@@ -1061,9 +1294,9 @@ struct netns_ipvs {
|
||||
unsigned int mixed_address_family_dests;
|
||||
unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */
|
||||
|
||||
/* the service mutex that protect svc_table and svc_fwm_table */
|
||||
struct mutex service_mutex;
|
||||
struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */
|
||||
struct ip_vs_rht __rcu *svc_table; /* Services */
|
||||
struct ip_vs_rht __rcu *conn_tab; /* Connections */
|
||||
atomic_t conn_tab_changes;/* ++ on new table */
|
||||
};
|
||||
|
||||
#define DEFAULT_SYNC_THRESHOLD 3
|
||||
@@ -1313,6 +1546,24 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
|
||||
|
||||
#endif
|
||||
|
||||
/* Get load factor to map conn_count/u_thresh to t->size */
|
||||
static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs)
|
||||
{
|
||||
return READ_ONCE(ipvs->sysctl_conn_lfactor);
|
||||
}
|
||||
|
||||
/* Get load factor to map num_services/u_thresh to t->size
|
||||
* Smaller value decreases u_thresh to reduce collisions but increases
|
||||
* the table size
|
||||
* Returns factor where:
|
||||
* - <0: u_thresh = size >> -factor, eg. lfactor -2 = 25% load
|
||||
* - >=0: u_thresh = size << factor, eg. lfactor 1 = 200% load
|
||||
*/
|
||||
static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs)
|
||||
{
|
||||
return READ_ONCE(ipvs->sysctl_svc_lfactor);
|
||||
}
|
||||
|
||||
/* IPVS core functions
|
||||
* (from ip_vs_core.c)
|
||||
*/
|
||||
@@ -1386,6 +1637,23 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
|
||||
}
|
||||
void ip_vs_conn_put(struct ip_vs_conn *cp);
|
||||
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
|
||||
int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
|
||||
int lfactor);
|
||||
struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets,
|
||||
int lfactor);
|
||||
|
||||
/* Map a hash node known to be the hn0 member back to its enclosing
 * struct ip_vs_conn. Unlike ip_vs_hn_to_conn(), hn->dir is not consulted.
 */
static inline struct ip_vs_conn *
|
||||
ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn)
|
||||
{
|
||||
return container_of(hn, struct ip_vs_conn, hn0);
|
||||
}
|
||||
|
||||
/* Map a hash node back to its enclosing struct ip_vs_conn, using hn->dir
 * to pick the member it is embedded as: non-zero means hn1, zero means hn0.
 */
static inline struct ip_vs_conn *
|
||||
ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn)
|
||||
{
|
||||
return hn->dir ? container_of(hn, struct ip_vs_conn, hn1) :
|
||||
container_of(hn, struct ip_vs_conn, hn0);
|
||||
}
|
||||
|
||||
struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
|
||||
const union nf_inet_addr *daddr,
|
||||
@@ -1739,6 +2007,13 @@ static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp)
|
||||
return fwd;
|
||||
}
|
||||
|
||||
/* Check if connection uses double hashing */
|
||||
static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp)
|
||||
{
|
||||
return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ &&
|
||||
!(cp->flags & IP_VS_CONN_F_TEMPLATE);
|
||||
}
|
||||
|
||||
void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
|
||||
struct ip_vs_conn *cp, int dir);
|
||||
|
||||
|
||||
@@ -92,6 +92,9 @@ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
|
||||
|
||||
return err;
|
||||
}
|
||||
#if IS_MODULE(CONFIG_NFT_FIB_IPV6)
|
||||
EXPORT_SYMBOL_GPL(fib6_lookup);
|
||||
#endif
|
||||
|
||||
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
|
||||
const struct sk_buff *skb,
|
||||
|
||||
@@ -342,6 +342,9 @@ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
|
||||
return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
|
||||
res, flags);
|
||||
}
|
||||
#if IS_MODULE(CONFIG_NFT_FIB_IPV6)
|
||||
EXPORT_SYMBOL_GPL(fib6_lookup);
|
||||
#endif
|
||||
|
||||
static void __net_init fib6_tables_init(struct net *net)
|
||||
{
|
||||
|
||||
@@ -52,7 +52,13 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
|
||||
fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK;
|
||||
fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev);
|
||||
|
||||
return lookup_flags;
|
||||
return lookup_flags | RT6_LOOKUP_F_DST_NOREF;
|
||||
}
|
||||
|
||||
/* Thin wrapper around fib6_lookup(): passes fl6->flowi6_oif as the oif
 * argument and returns fib6_lookup()'s result unchanged. The lookup
 * outcome is stored in @res.
 */
static int nft_fib6_lookup(struct net *net, struct flowi6 *fl6,
|
||||
struct fib6_result *res, int flags)
|
||||
{
|
||||
return fib6_lookup(net, fl6->flowi6_oif, fl6, res, flags);
|
||||
}
|
||||
|
||||
static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
|
||||
@@ -60,13 +66,14 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
|
||||
struct ipv6hdr *iph)
|
||||
{
|
||||
const struct net_device *dev = NULL;
|
||||
struct fib6_result res = {};
|
||||
int route_err, addrtype;
|
||||
struct rt6_info *rt;
|
||||
struct flowi6 fl6 = {
|
||||
.flowi6_iif = LOOPBACK_IFINDEX,
|
||||
.flowi6_proto = pkt->tprot,
|
||||
.flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
|
||||
};
|
||||
int lookup_flags;
|
||||
u32 ret = 0;
|
||||
|
||||
if (priv->flags & NFTA_FIB_F_IIF)
|
||||
@@ -74,29 +81,23 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
|
||||
else if (priv->flags & NFTA_FIB_F_OIF)
|
||||
dev = nft_out(pkt);
|
||||
|
||||
nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);
|
||||
lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);
|
||||
|
||||
if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
|
||||
ret = RTN_LOCAL;
|
||||
|
||||
route_err = nf_ip6_route(nft_net(pkt), (struct dst_entry **)&rt,
|
||||
flowi6_to_flowi(&fl6), false);
|
||||
route_err = nft_fib6_lookup(nft_net(pkt), &fl6, &res, lookup_flags);
|
||||
if (route_err)
|
||||
goto err;
|
||||
|
||||
if (rt->rt6i_flags & RTF_REJECT) {
|
||||
route_err = rt->dst.error;
|
||||
dst_release(&rt->dst);
|
||||
goto err;
|
||||
}
|
||||
if (res.fib6_flags & RTF_REJECT)
|
||||
return res.fib6_type;
|
||||
|
||||
if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr))
|
||||
if (__ipv6_anycast_destination(&res.f6i->fib6_dst, res.fib6_flags, &fl6.daddr))
|
||||
ret = RTN_ANYCAST;
|
||||
else if (!dev && rt->rt6i_flags & RTF_LOCAL)
|
||||
else if (!dev && res.fib6_flags & RTF_LOCAL)
|
||||
ret = RTN_LOCAL;
|
||||
|
||||
dst_release(&rt->dst);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -152,6 +153,33 @@ static bool nft_fib_v6_skip_icmpv6(const struct sk_buff *skb, u8 next, const str
|
||||
return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL;
|
||||
}
|
||||
|
||||
/* Return true if @nh_dev is @dev itself, or if the value returned by
 * l3mdev_master_ifindex_rcu() for @nh_dev equals @dev's ifindex.
 * @dev is dereferenced unconditionally and so must not be NULL.
 */
static bool nft_fib6_info_nh_dev_match(const struct net_device *nh_dev,
|
||||
const struct net_device *dev)
|
||||
{
|
||||
return nh_dev == dev ||
|
||||
l3mdev_master_ifindex_rcu(nh_dev) == dev->ifindex;
|
||||
}
|
||||
|
||||
/* Return true if the nexthop device of @rt, or of any entry linked on
 * rt->fib6_siblings, matches @dev according to
 * nft_fib6_info_nh_dev_match(); false otherwise.
 *
 * NOTE(review): the sibling walk uses plain list_for_each_entry(). If the
 * only protection callers hold is rcu_read_lock(), confirm whether the
 * _rcu iterator (and a guard against @rt being unlinked mid-walk) is
 * required here.
 */
static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt,
|
||||
const struct net_device *dev)
|
||||
{
|
||||
const struct net_device *nh_dev;
|
||||
struct fib6_info *iter;
|
||||
|
||||
/* check the route's own nexthop device first */
nh_dev = fib6_info_nh_dev(rt);
|
||||
if (nft_fib6_info_nh_dev_match(nh_dev, dev))
|
||||
return true;
|
||||
|
||||
/* then every sibling route on the fib6_siblings list */
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
|
||||
nh_dev = fib6_info_nh_dev(iter);
|
||||
|
||||
if (nft_fib6_info_nh_dev_match(nh_dev, dev))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
|
||||
const struct nft_pktinfo *pkt)
|
||||
{
|
||||
@@ -160,14 +188,14 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
|
||||
const struct net_device *found = NULL;
|
||||
const struct net_device *oif = NULL;
|
||||
u32 *dest = &regs->data[priv->dreg];
|
||||
struct fib6_result res = {};
|
||||
struct ipv6hdr *iph, _iph;
|
||||
struct flowi6 fl6 = {
|
||||
.flowi6_iif = LOOPBACK_IFINDEX,
|
||||
.flowi6_proto = pkt->tprot,
|
||||
.flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
|
||||
};
|
||||
struct rt6_info *rt;
|
||||
int lookup_flags;
|
||||
int lookup_flags, ret;
|
||||
|
||||
if (nft_fib_can_skip(pkt)) {
|
||||
nft_fib_store_result(dest, priv, nft_in(pkt));
|
||||
@@ -193,26 +221,17 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
|
||||
lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph);
|
||||
|
||||
*dest = 0;
|
||||
rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb,
|
||||
lookup_flags);
|
||||
if (rt->dst.error)
|
||||
goto put_rt_err;
|
||||
|
||||
/* Should not see RTF_LOCAL here */
|
||||
if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
|
||||
goto put_rt_err;
|
||||
ret = nft_fib6_lookup(nft_net(pkt), &fl6, &res, lookup_flags);
|
||||
if (ret || res.fib6_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
|
||||
return;
|
||||
|
||||
if (!oif) {
|
||||
found = rt->rt6i_idev->dev;
|
||||
found = fib6_info_nh_dev(res.f6i);
|
||||
} else {
|
||||
if (oif == rt->rt6i_idev->dev ||
|
||||
l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex)
|
||||
if (nft_fib6_info_nh_uses_dev(res.f6i, oif))
|
||||
found = oif;
|
||||
}
|
||||
|
||||
nft_fib_store_result(dest, priv, found);
|
||||
put_rt_err:
|
||||
ip6_rt_put(rt);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nft_fib6_eval);
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -117,6 +117,185 @@ void ip_vs_init_hash_table(struct list_head *table, int rows)
|
||||
INIT_LIST_HEAD(&table[rows]);
|
||||
}
|
||||
|
||||
/* IPVS Resizable Hash Tables:
|
||||
* - list_bl buckets with bit lock
|
||||
*
|
||||
* Goals:
|
||||
* - RCU lookup for entry can run in parallel with add/del/move operations
|
||||
* - hash keys can be on non-contiguous memory
|
||||
* - support entries with duplicate keys
|
||||
* - unlink entries without lookup, use the saved table and bucket id
|
||||
* - resizing can trigger on load change or depending on key refresh period
|
||||
* - customizable load factor to balance between speed and memory usage
|
||||
* - add/del/move operations should be allowed for any context
|
||||
*
|
||||
* Resizing:
|
||||
* - new table is attached to the current table and all entries are moved
|
||||
* with new hash key. Finally, the new table is installed as current one and
|
||||
* the old table is released after RCU grace period.
|
||||
* - RCU read-side critical sections will walk two tables while resizing is
|
||||
* in progress
|
||||
* - new entries are added to the new table
|
||||
* - entries will be deleted from the old or from the new table, the table_id
|
||||
* can be saved into entry as part of the hash key to know where the entry is
|
||||
* hashed
|
||||
* - move operations may delay readers or cause a retry for the modified
|
||||
* bucket. As a result, the searched entry will be found but walkers that
|
||||
* operate on multiple entries may see the same entry twice if bucket walking is retried.
|
||||
* - for fast path the number of entries (load) can be compared to u_thresh
|
||||
* and l_thresh to decide when to trigger table growing/shrinking. They
|
||||
* are calculated based on load factor (shift count), negative value allows
|
||||
* load to be below 100% to reduce collisions by maintaining larger table
|
||||
* while positive value tolerates collisions by using smaller table and load
|
||||
* above 100%: u_thresh(load) = size * (2 ^ lfactor)
|
||||
*
|
||||
* Locking:
|
||||
* - lock: protect seqc if other context except resizer can move entries
|
||||
* - seqc: seqcount_t, delay/retry readers while entries are moved to
|
||||
* new table on resizing
|
||||
* - bit lock: serialize bucket modifications
|
||||
* - writers may use other locking mechanisms to serialize operations for
|
||||
* resizing, moving and installing new tables
|
||||
*/
|
||||
|
||||
void ip_vs_rht_free(struct ip_vs_rht *t)
|
||||
{
|
||||
kvfree(t->buckets);
|
||||
kvfree(t->seqc);
|
||||
kvfree(t->lock);
|
||||
kfree(t);
|
||||
}
|
||||
|
||||
void ip_vs_rht_rcu_free(struct rcu_head *head)
|
||||
{
|
||||
struct ip_vs_rht *t;
|
||||
|
||||
t = container_of(head, struct ip_vs_rht, rcu_head);
|
||||
ip_vs_rht_free(t);
|
||||
}
|
||||
|
||||
struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks)
|
||||
{
|
||||
struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL);
|
||||
int i;
|
||||
|
||||
if (!t)
|
||||
return NULL;
|
||||
if (scounts) {
|
||||
int ml = roundup_pow_of_two(nr_cpu_ids);
|
||||
|
||||
scounts = min(scounts, buckets);
|
||||
scounts = min(scounts, ml);
|
||||
t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL);
|
||||
if (!t->seqc)
|
||||
goto err;
|
||||
for (i = 0; i < scounts; i++)
|
||||
seqcount_init(&t->seqc[i]);
|
||||
|
||||
if (locks) {
|
||||
locks = min(locks, scounts);
|
||||
t->lock = kvmalloc_array(locks, sizeof(*t->lock),
|
||||
GFP_KERNEL);
|
||||
if (!t->lock)
|
||||
goto err;
|
||||
for (i = 0; i < locks; i++)
|
||||
spin_lock_init(&t->lock[i].l);
|
||||
}
|
||||
}
|
||||
|
||||
t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL);
|
||||
if (!t->buckets)
|
||||
goto err;
|
||||
for (i = 0; i < buckets; i++)
|
||||
INIT_HLIST_BL_HEAD(&t->buckets[i]);
|
||||
t->mask = buckets - 1;
|
||||
t->size = buckets;
|
||||
t->seqc_mask = scounts - 1;
|
||||
t->lock_mask = locks - 1;
|
||||
t->u_thresh = buckets;
|
||||
t->l_thresh = buckets >> 4;
|
||||
t->bits = order_base_2(buckets);
|
||||
/* new_tbl points to self if no new table is filled */
|
||||
RCU_INIT_POINTER(t->new_tbl, t);
|
||||
get_random_bytes(&t->hash_key, sizeof(t->hash_key));
|
||||
return t;
|
||||
|
||||
err:
|
||||
ip_vs_rht_free(t);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Get the desired table size for n entries based on current table size and
|
||||
* by using the formula size = n / (2^lfactor)
|
||||
* lfactor: shift value for the load factor:
|
||||
* - >0: u_thresh=size << lfactor, for load factor above 100%
|
||||
* - <0: u_thresh=size >> -lfactor, for load factor below 100%
|
||||
* - 0: for load factor of 100%
|
||||
*/
|
||||
int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
|
||||
int lfactor, int min_bits, int max_bits)
|
||||
{
|
||||
if (!t)
|
||||
return 1 << min_bits;
|
||||
n = roundup_pow_of_two(n);
|
||||
if (lfactor < 0) {
|
||||
int factor = min(-lfactor, max_bits);
|
||||
|
||||
n = min(n, 1 << (max_bits - factor));
|
||||
n <<= factor;
|
||||
} else {
|
||||
n = min(n >> lfactor, 1 << max_bits);
|
||||
}
|
||||
if (lfactor != t->lfactor)
|
||||
return clamp(n, 1 << min_bits, 1 << max_bits);
|
||||
if (n > t->size)
|
||||
return n;
|
||||
if (n > t->size >> 4)
|
||||
return t->size;
|
||||
/* Shrink but keep it n * 2 to prevent frequent resizing */
|
||||
return clamp(n << 1, 1 << min_bits, 1 << max_bits);
|
||||
}
|
||||
|
||||
/* Set thresholds based on table size and load factor:
|
||||
* u_thresh = size * (2^lfactor)
|
||||
* l_thresh = u_thresh / 16
|
||||
* u_thresh/l_thresh can be used to check if load triggers a table grow/shrink
|
||||
*/
|
||||
void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
|
||||
int min_bits, int max_bits)
|
||||
{
|
||||
if (size >= 1 << max_bits)
|
||||
t->u_thresh = INT_MAX; /* stop growing */
|
||||
else if (lfactor <= 0)
|
||||
t->u_thresh = size >> min(-lfactor, max_bits);
|
||||
else
|
||||
t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor;
|
||||
|
||||
/* l_thresh: shrink when load is 16 times lower, can be 0 */
|
||||
if (size >= 1 << max_bits)
|
||||
t->l_thresh = (1 << max_bits) >> 4;
|
||||
else if (size > 1 << min_bits)
|
||||
t->l_thresh = t->u_thresh >> 4;
|
||||
else
|
||||
t->l_thresh = 0; /* stop shrinking */
|
||||
}
|
||||
|
||||
/* Return hash value for local info (fast, insecure) */
|
||||
u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
|
||||
const union nf_inet_addr *addr, u32 v1, u32 v2)
|
||||
{
|
||||
u32 v3;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (af == AF_INET6)
|
||||
v3 = ipv6_addr_hash(&addr->in6);
|
||||
else
|
||||
#endif
|
||||
v3 = addr->all[0];
|
||||
|
||||
return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]);
|
||||
}
|
||||
|
||||
static inline void
|
||||
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
|
||||
{
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/rcupdate_wait.h>
|
||||
|
||||
#include <net/net_namespace.h>
|
||||
#include <linux/nsproxy.h>
|
||||
@@ -293,47 +294,59 @@ ip_vs_use_count_dec(void)
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Service hashing:
|
||||
* Operation Locking order
|
||||
* ---------------------------------------------------------------------------
|
||||
* add table service_mutex, svc_resize_sem(W)
|
||||
* del table service_mutex
|
||||
* move between tables svc_resize_sem(W), seqcount_t(W), bit lock
|
||||
* add/del service service_mutex, bit lock
|
||||
* find service RCU, seqcount_t(R)
|
||||
* walk services(blocking) service_mutex, svc_resize_sem(R)
|
||||
* walk services(non-blocking) RCU, seqcount_t(R)
|
||||
*
|
||||
* - new tables are linked/unlinked under service_mutex and svc_resize_sem
|
||||
* - new table is linked on resizing and all operations can run in parallel
|
||||
* in 2 tables until the new table is registered as current one
|
||||
* - two contexts can modify buckets: config and table resize, both in
|
||||
* process context
|
||||
* - only table resizer can move entries, so we do not protect t->seqc[]
|
||||
* items with t->lock[]
|
||||
* - lookups occur under RCU lock and seqcount reader lock to detect if
|
||||
* services are moved to new table
|
||||
* - move operations may disturb readers: find operation will not miss entries
|
||||
* but walkers may see same entry twice if they are forced to retry chains
|
||||
* - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold
|
||||
* service_mutex to prevent new tables from being installed, or to check
|
||||
* svc_table_changes and repeat the RCU read section if a new table is installed
|
||||
*/
|
||||
|
||||
/*
|
||||
* Returns hash value for virtual service
|
||||
*/
|
||||
static inline unsigned int
|
||||
ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
|
||||
static inline u32
|
||||
ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto,
|
||||
const union nf_inet_addr *addr, __be16 port)
|
||||
{
|
||||
unsigned int porth = ntohs(port);
|
||||
__be32 addr_fold = addr->ip;
|
||||
__u32 ahash;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (af == AF_INET6)
|
||||
addr_fold = addr->ip6[0]^addr->ip6[1]^
|
||||
addr->ip6[2]^addr->ip6[3];
|
||||
#endif
|
||||
ahash = ntohl(addr_fold);
|
||||
ahash ^= ((size_t) ipvs >> 8);
|
||||
|
||||
return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
|
||||
IP_VS_SVC_TAB_MASK;
|
||||
return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns hash value of fwmark for virtual service lookup
|
||||
*/
|
||||
static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark)
|
||||
static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af,
|
||||
__u32 fwmark)
|
||||
{
|
||||
return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
|
||||
return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]);
|
||||
}
|
||||
|
||||
/*
|
||||
* Hashes a service in the svc_table by <netns,proto,addr,port>
|
||||
* or by fwmark.
|
||||
* Should be called with locked tables.
|
||||
*/
|
||||
/* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */
|
||||
static int ip_vs_svc_hash(struct ip_vs_service *svc)
|
||||
{
|
||||
unsigned int hash;
|
||||
struct netns_ipvs *ipvs = svc->ipvs;
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_rht *t;
|
||||
u32 hash;
|
||||
|
||||
if (svc->flags & IP_VS_SVC_F_HASHED) {
|
||||
pr_err("%s(): request for already hashed, called from %pS\n",
|
||||
@@ -341,23 +354,32 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* increase its refcnt because it is referenced by the svc table */
|
||||
atomic_inc(&svc->refcnt);
|
||||
|
||||
/* New entries go into recent table */
|
||||
t = rcu_dereference_protected(ipvs->svc_table, 1);
|
||||
t = rcu_dereference_protected(t->new_tbl, 1);
|
||||
|
||||
if (svc->fwmark == 0) {
|
||||
/*
|
||||
* Hash it by <netns,protocol,addr,port>
|
||||
* Hash it by <protocol,addr,port>
|
||||
*/
|
||||
hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol,
|
||||
hash = ip_vs_svc_hashval(t, svc->af, svc->protocol,
|
||||
&svc->addr, svc->port);
|
||||
} else {
|
||||
/*
|
||||
* Hash it by fwmark
|
||||
*/
|
||||
hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark);
|
||||
hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark);
|
||||
}
|
||||
hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]);
|
||||
|
||||
head = t->buckets + (hash & t->mask);
|
||||
hlist_bl_lock(head);
|
||||
WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash));
|
||||
svc->flags |= IP_VS_SVC_F_HASHED;
|
||||
/* increase its refcnt because it is referenced by the svc table */
|
||||
atomic_inc(&svc->refcnt);
|
||||
hlist_bl_add_head_rcu(&svc->s_list, head);
|
||||
hlist_bl_unlock(head);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -368,17 +390,45 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
|
||||
*/
|
||||
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
|
||||
{
|
||||
struct netns_ipvs *ipvs = svc->ipvs;
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_rht *t;
|
||||
u32 hash_key2;
|
||||
u32 hash_key;
|
||||
|
||||
if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
|
||||
pr_err("%s(): request for unhash flagged, called from %pS\n",
|
||||
__func__, __builtin_return_address(0));
|
||||
return 0;
|
||||
}
|
||||
|
||||
t = rcu_dereference_protected(ipvs->svc_table, 1);
|
||||
hash_key = READ_ONCE(svc->hash_key);
|
||||
/* We need to lock the bucket in the right table */
|
||||
if (ip_vs_rht_same_table(t, hash_key)) {
|
||||
head = t->buckets + (hash_key & t->mask);
|
||||
hlist_bl_lock(head);
|
||||
/* Ensure hash_key is read under lock */
|
||||
hash_key2 = READ_ONCE(svc->hash_key);
|
||||
/* Moved to new table ? */
|
||||
if (hash_key != hash_key2) {
|
||||
hlist_bl_unlock(head);
|
||||
t = rcu_dereference_protected(t->new_tbl, 1);
|
||||
head = t->buckets + (hash_key2 & t->mask);
|
||||
hlist_bl_lock(head);
|
||||
}
|
||||
} else {
|
||||
/* It is already moved to new table */
|
||||
t = rcu_dereference_protected(t->new_tbl, 1);
|
||||
head = t->buckets + (hash_key & t->mask);
|
||||
hlist_bl_lock(head);
|
||||
}
|
||||
/* Remove it from svc_table */
|
||||
hlist_del_rcu(&svc->s_list);
|
||||
hlist_bl_del_rcu(&svc->s_list);
|
||||
|
||||
svc->flags &= ~IP_VS_SVC_F_HASHED;
|
||||
atomic_dec(&svc->refcnt);
|
||||
hlist_bl_unlock(head);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -390,18 +440,29 @@ static inline struct ip_vs_service *
|
||||
__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
|
||||
const union nf_inet_addr *vaddr, __be16 vport)
|
||||
{
|
||||
unsigned int hash;
|
||||
DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_service *svc;
|
||||
struct ip_vs_rht *t, *p;
|
||||
struct hlist_bl_node *e;
|
||||
u32 hash, hash_key;
|
||||
|
||||
/* Check for "full" addressed entries */
|
||||
hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport);
|
||||
ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
|
||||
/* Check for "full" addressed entries */
|
||||
hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport);
|
||||
|
||||
hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) {
|
||||
if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) &&
|
||||
svc->port == vport && svc->protocol == protocol &&
|
||||
!svc->fwmark) {
|
||||
/* HIT */
|
||||
return svc;
|
||||
hash_key = ip_vs_rht_build_hash_key(t, hash);
|
||||
ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
|
||||
hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
|
||||
if (READ_ONCE(svc->hash_key) == hash_key &&
|
||||
svc->af == af &&
|
||||
ip_vs_addr_equal(af, &svc->addr, vaddr) &&
|
||||
svc->port == vport &&
|
||||
svc->protocol == protocol && !svc->fwmark) {
|
||||
/* HIT */
|
||||
return svc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -415,16 +476,26 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
|
||||
static inline struct ip_vs_service *
|
||||
__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
|
||||
{
|
||||
unsigned int hash;
|
||||
DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_service *svc;
|
||||
struct ip_vs_rht *t, *p;
|
||||
struct hlist_bl_node *e;
|
||||
u32 hash, hash_key;
|
||||
|
||||
/* Check for fwmark addressed entries */
|
||||
hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark);
|
||||
ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
|
||||
/* Check for fwmark addressed entries */
|
||||
hash = ip_vs_svc_fwm_hashval(t, af, fwmark);
|
||||
|
||||
hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) {
|
||||
if (svc->fwmark == fwmark && svc->af == af) {
|
||||
/* HIT */
|
||||
return svc;
|
||||
hash_key = ip_vs_rht_build_hash_key(t, hash);
|
||||
ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
|
||||
hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
|
||||
if (READ_ONCE(svc->hash_key) == hash_key &&
|
||||
svc->fwmark == fwmark && svc->af == af) {
|
||||
/* HIT */
|
||||
return svc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -487,6 +558,220 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
|
||||
return svc;
|
||||
}
|
||||
|
||||
/* Return the number of registered services */
|
||||
static int ip_vs_get_num_services(struct netns_ipvs *ipvs)
|
||||
{
|
||||
int ns = 0, ni = IP_VS_AF_MAX;
|
||||
|
||||
while (--ni >= 0)
|
||||
ns += atomic_read(&ipvs->num_services[ni]);
|
||||
return ns;
|
||||
}
|
||||
|
||||
/* Get default load factor to map num_services/u_thresh to t->size */
|
||||
static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs)
|
||||
{
|
||||
int factor;
|
||||
|
||||
if (net_eq(ipvs->net, &init_net))
|
||||
factor = -3; /* grow if load is above 12.5% */
|
||||
else
|
||||
factor = -2; /* grow if load is above 25% */
|
||||
return factor;
|
||||
}
|
||||
|
||||
/* Get the desired svc_table size */
|
||||
static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
|
||||
int lfactor)
|
||||
{
|
||||
return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs),
|
||||
lfactor, IP_VS_SVC_TAB_MIN_BITS,
|
||||
IP_VS_SVC_TAB_MAX_BITS);
|
||||
}
|
||||
|
||||
/* Allocate svc_table */
|
||||
static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs,
|
||||
int buckets, int lfactor)
|
||||
{
|
||||
struct ip_vs_rht *t;
|
||||
int scounts, locks;
|
||||
|
||||
/* No frequent lookups to race with resizing, so use max of 64
|
||||
* seqcounts. Only resizer moves entries, so use 0 locks.
|
||||
*/
|
||||
scounts = clamp(buckets >> 4, 1, 64);
|
||||
locks = 0;
|
||||
|
||||
t = ip_vs_rht_alloc(buckets, scounts, locks);
|
||||
if (!t)
|
||||
return NULL;
|
||||
t->lfactor = lfactor;
|
||||
ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS,
|
||||
IP_VS_SVC_TAB_MAX_BITS);
|
||||
return t;
|
||||
}
|
||||
|
||||
/* svc_table resizer work
 *
 * Delayed work that replaces ipvs->svc_table with a table of the desired
 * size/load factor:
 * 1. try to take svc_resize_sem (write) and service_mutex without blocking;
 *    if one of them is busy, more_work stays true and the work is queued
 *    again at the "out" label
 * 2. allocate the new table with flipped table_id, attach it as t->new_tbl
 *    and release service_mutex
 * 3. move the services bucket by bucket into the new table, each bucket
 *    under its seqcount (write side) and the bucket bit locks
 * 4. retake service_mutex, install the new table as ipvs->svc_table and
 *    increment svc_table_changes
 * 5. wait for a RCU grace period and free the old table
 * The work stops early when ipvs->enable is cleared or when
 * IP_VS_WORK_SVC_NORESIZE is set.
 */
|
||||
static void svc_resize_work_handler(struct work_struct *work)
|
||||
{
|
||||
struct hlist_bl_head *head, *head2;
|
||||
struct ip_vs_rht *t_free = NULL;
|
||||
unsigned int resched_score = 0;
|
||||
struct hlist_bl_node *cn, *nn;
|
||||
struct ip_vs_rht *t, *t_new;
|
||||
struct ip_vs_service *svc;
|
||||
struct netns_ipvs *ipvs;
|
||||
bool more_work = true;
|
||||
seqcount_t *sc;
|
||||
int limit = 0;
|
||||
int new_size;
|
||||
int lfactor;
|
||||
u32 bucket;
|
||||
|
||||
ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work);
|
||||
|
||||
/* Never block on the locks: on contention more_work is still true
 * and the work is queued again at "out"
 */
if (!down_write_trylock(&ipvs->svc_resize_sem))
|
||||
goto out;
|
||||
if (!mutex_trylock(&ipvs->service_mutex))
|
||||
goto unlock_sem;
|
||||
/* Both locks are held: retry only if the allocation below fails */
more_work = false;
|
||||
clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
|
||||
if (!READ_ONCE(ipvs->enable) ||
|
||||
test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
|
||||
goto unlock_m;
|
||||
t = rcu_dereference_protected(ipvs->svc_table, 1);
|
||||
/* Do nothing if table is removed */
|
||||
if (!t)
|
||||
goto unlock_m;
|
||||
/* New table needs to be registered? BUG! */
|
||||
if (t != rcu_dereference_protected(t->new_tbl, 1))
|
||||
goto unlock_m;
|
||||
|
||||
lfactor = sysctl_svc_lfactor(ipvs);
|
||||
/* Should we resize ? */
|
||||
new_size = ip_vs_svc_desired_size(ipvs, t, lfactor);
|
||||
if (new_size == t->size && lfactor == t->lfactor)
|
||||
goto unlock_m;
|
||||
|
||||
t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
|
||||
if (!t_new) {
|
||||
more_work = true;
|
||||
goto unlock_m;
|
||||
}
|
||||
/* Flip the table_id */
|
||||
t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;
|
||||
|
||||
rcu_assign_pointer(t->new_tbl, t_new);
|
||||
/* Allow add/del to new_tbl while moving from old table */
|
||||
mutex_unlock(&ipvs->service_mutex);
|
||||
|
||||
ip_vs_rht_for_each_bucket(t, bucket, head) {
|
||||
same_bucket:
|
||||
/* limit counts visited buckets and moved entries: when it reaches 16
 * check the stop conditions and reschedule if many empty buckets
 * were skipped. service_mutex is not held here, so leave via
 * unlock_sem.
 */
if (++limit >= 16) {
|
||||
if (!READ_ONCE(ipvs->enable) ||
|
||||
test_bit(IP_VS_WORK_SVC_NORESIZE,
|
||||
&ipvs->work_flags))
|
||||
goto unlock_sem;
|
||||
if (resched_score >= 100) {
|
||||
resched_score = 0;
|
||||
cond_resched();
|
||||
}
|
||||
limit = 0;
|
||||
}
|
||||
if (hlist_bl_empty(head)) {
|
||||
resched_score++;
|
||||
continue;
|
||||
}
|
||||
/* Preemption calls ahead... */
|
||||
resched_score = 0;
|
||||
|
||||
sc = &t->seqc[bucket & t->seqc_mask];
|
||||
/* seqcount_t usage considering PREEMPT_RT rules:
|
||||
* - we are the only writer => preemption can be allowed
|
||||
* - readers (SoftIRQ) => disable BHs
|
||||
* - readers (processes) => preemption should be disabled
|
||||
*/
|
||||
local_bh_disable();
|
||||
preempt_disable_nested();
|
||||
write_seqcount_begin(sc);
|
||||
hlist_bl_lock(head);
|
||||
|
||||
hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) {
|
||||
u32 hash;
|
||||
|
||||
/* New hash for the new table */
|
||||
if (svc->fwmark == 0) {
|
||||
/* Hash it by <protocol,addr,port> */
|
||||
hash = ip_vs_svc_hashval(t_new, svc->af,
|
||||
svc->protocol,
|
||||
&svc->addr, svc->port);
|
||||
} else {
|
||||
/* Hash it by fwmark */
|
||||
hash = ip_vs_svc_fwm_hashval(t_new, svc->af,
|
||||
svc->fwmark);
|
||||
}
|
||||
hlist_bl_del_rcu(&svc->s_list);
|
||||
head2 = t_new->buckets + (hash & t_new->mask);
|
||||
|
||||
hlist_bl_lock(head2);
|
||||
WRITE_ONCE(svc->hash_key,
|
||||
ip_vs_rht_build_hash_key(t_new, hash));
|
||||
/* t_new->seqc are not used at this stage, we race
|
||||
* only with add/del, so only lock the bucket.
|
||||
*/
|
||||
hlist_bl_add_head_rcu(&svc->s_list, head2);
|
||||
hlist_bl_unlock(head2);
|
||||
/* Too long chain? Do it in steps */
|
||||
if (++limit >= 64)
|
||||
break;
|
||||
}
|
||||
|
||||
hlist_bl_unlock(head);
|
||||
write_seqcount_end(sc);
|
||||
preempt_enable_nested();
|
||||
local_bh_enable();
|
||||
/* The chain walk was cut short: continue with the same bucket */
if (limit >= 64)
|
||||
goto same_bucket;
|
||||
}
|
||||
|
||||
/* Tables can be switched only under service_mutex */
|
||||
while (!mutex_trylock(&ipvs->service_mutex)) {
|
||||
cond_resched();
|
||||
if (!READ_ONCE(ipvs->enable) ||
|
||||
test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
|
||||
goto unlock_sem;
|
||||
}
|
||||
if (!READ_ONCE(ipvs->enable) ||
|
||||
test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
|
||||
goto unlock_m;
|
||||
|
||||
rcu_assign_pointer(ipvs->svc_table, t_new);
|
||||
/* Inform readers that new table is installed */
|
||||
smp_mb__before_atomic();
|
||||
atomic_inc(&ipvs->svc_table_changes);
|
||||
/* The old table is freed below, after both locks are released */
t_free = t;
|
||||
|
||||
unlock_m:
|
||||
mutex_unlock(&ipvs->service_mutex);
|
||||
|
||||
unlock_sem:
|
||||
up_write(&ipvs->svc_resize_sem);
|
||||
|
||||
if (t_free) {
|
||||
/* RCU readers should not see more than two tables in chain.
|
||||
* To prevent new table to be attached wait here instead of
|
||||
* freeing the old table in RCU callback.
|
||||
*/
|
||||
synchronize_rcu();
|
||||
ip_vs_rht_free(t_free);
|
||||
}
|
||||
|
||||
out:
|
||||
/* Queue the work again only if there is still work to do and
 * resizing was not stopped
 */
if (!READ_ONCE(ipvs->enable) || !more_work ||
|
||||
test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
|
||||
return;
|
||||
queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
|
||||
}
|
||||
|
||||
static inline void
|
||||
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
|
||||
@@ -1357,12 +1642,14 @@ static int
|
||||
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
|
||||
struct ip_vs_service **svc_p)
|
||||
{
|
||||
int ret = 0;
|
||||
struct ip_vs_scheduler *sched = NULL;
|
||||
struct ip_vs_rht *tc_new = NULL;
|
||||
struct ip_vs_rht *t, *t_new = NULL;
|
||||
int af_id = ip_vs_af_index(u->af);
|
||||
struct ip_vs_pe *pe = NULL;
|
||||
struct ip_vs_service *svc = NULL;
|
||||
struct ip_vs_pe *pe = NULL;
|
||||
int ret_hooks = -1;
|
||||
int ret = 0;
|
||||
|
||||
/* increase the module use count */
|
||||
if (!ip_vs_use_count_inc())
|
||||
@@ -1404,6 +1691,29 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
|
||||
}
|
||||
#endif
|
||||
|
||||
t = rcu_dereference_protected(ipvs->svc_table, 1);
|
||||
if (!t) {
|
||||
int lfactor = sysctl_svc_lfactor(ipvs);
|
||||
int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor);
|
||||
|
||||
t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
|
||||
if (!t_new) {
|
||||
ret = -ENOMEM;
|
||||
goto out_err;
|
||||
}
|
||||
}
|
||||
|
||||
if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
|
||||
int lfactor = sysctl_conn_lfactor(ipvs);
|
||||
int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor);
|
||||
|
||||
tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
|
||||
if (!tc_new) {
|
||||
ret = -ENOMEM;
|
||||
goto out_err;
|
||||
}
|
||||
}
|
||||
|
||||
if (!atomic_read(&ipvs->num_services[af_id])) {
|
||||
ret = ip_vs_register_hooks(ipvs, u->af);
|
||||
if (ret < 0)
|
||||
@@ -1449,6 +1759,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
|
||||
if (t_new) {
|
||||
clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
|
||||
rcu_assign_pointer(ipvs->svc_table, t_new);
|
||||
t_new = NULL;
|
||||
}
|
||||
if (tc_new) {
|
||||
rcu_assign_pointer(ipvs->conn_tab, tc_new);
|
||||
tc_new = NULL;
|
||||
}
|
||||
|
||||
/* Update the virtual service counters */
|
||||
if (svc->port == FTPPORT)
|
||||
atomic_inc(&ipvs->ftpsvc_counter[af_id]);
|
||||
@@ -1470,6 +1790,12 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
|
||||
/* Hash the service into the service table */
|
||||
ip_vs_svc_hash(svc);
|
||||
|
||||
/* Schedule resize work */
|
||||
if (t && ip_vs_get_num_services(ipvs) > t->u_thresh &&
|
||||
!test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
|
||||
queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
|
||||
1);
|
||||
|
||||
*svc_p = svc;
|
||||
|
||||
if (!READ_ONCE(ipvs->enable)) {
|
||||
@@ -1484,6 +1810,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
|
||||
|
||||
|
||||
out_err:
|
||||
if (tc_new)
|
||||
ip_vs_rht_free(tc_new);
|
||||
if (t_new)
|
||||
ip_vs_rht_free(t_new);
|
||||
if (ret_hooks >= 0)
|
||||
ip_vs_unregister_hooks(ipvs, u->af);
|
||||
if (svc != NULL) {
|
||||
@@ -1671,10 +2001,38 @@ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
|
||||
*/
|
||||
static int ip_vs_del_service(struct ip_vs_service *svc)
|
||||
{
|
||||
struct netns_ipvs *ipvs;
|
||||
struct ip_vs_rht *t, *p;
|
||||
int ns;
|
||||
|
||||
if (svc == NULL)
|
||||
return -EEXIST;
|
||||
ipvs = svc->ipvs;
|
||||
ip_vs_unlink_service(svc, false);
|
||||
t = rcu_dereference_protected(ipvs->svc_table, 1);
|
||||
|
||||
/* Drop the table if no more services */
|
||||
ns = ip_vs_get_num_services(ipvs);
|
||||
if (!ns) {
|
||||
/* Stop the resizer and drop the tables */
|
||||
set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
|
||||
cancel_delayed_work_sync(&ipvs->svc_resize_work);
|
||||
if (t) {
|
||||
rcu_assign_pointer(ipvs->svc_table, NULL);
|
||||
while (1) {
|
||||
p = rcu_dereference_protected(t->new_tbl, 1);
|
||||
call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
|
||||
if (p == t)
|
||||
break;
|
||||
t = p;
|
||||
}
|
||||
}
|
||||
} else if (ns <= t->l_thresh &&
|
||||
!test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
|
||||
&ipvs->work_flags)) {
|
||||
queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
|
||||
1);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1684,14 +2042,36 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
|
||||
*/
|
||||
static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
|
||||
{
|
||||
int idx;
|
||||
DECLARE_IP_VS_RHT_WALK_BUCKETS();
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_service *svc;
|
||||
struct hlist_node *n;
|
||||
struct hlist_bl_node *ne;
|
||||
struct hlist_bl_node *e;
|
||||
struct ip_vs_rht *t, *p;
|
||||
|
||||
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
|
||||
hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx],
|
||||
s_list)
|
||||
ip_vs_unlink_service(svc, cleanup);
|
||||
/* Stop the resizer and drop the tables */
|
||||
if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
|
||||
cancel_delayed_work_sync(&ipvs->svc_resize_work);
|
||||
/* No resizer, so now we have exclusive write access */
|
||||
|
||||
if (ip_vs_get_num_services(ipvs)) {
|
||||
ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
|
||||
hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list)
|
||||
ip_vs_unlink_service(svc, cleanup);
|
||||
}
|
||||
}
|
||||
|
||||
/* Unregister the hash table and release it after RCU grace period */
|
||||
t = rcu_dereference_protected(ipvs->svc_table, 1);
|
||||
if (t) {
|
||||
rcu_assign_pointer(ipvs->svc_table, NULL);
|
||||
while (1) {
|
||||
p = rcu_dereference_protected(t->new_tbl, 1);
|
||||
call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
|
||||
if (p == t)
|
||||
break;
|
||||
t = p;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -1742,19 +2122,44 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
|
||||
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
||||
struct net *net = dev_net(dev);
|
||||
struct netns_ipvs *ipvs = net_ipvs(net);
|
||||
DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
|
||||
unsigned int resched_score = 0;
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_service *svc;
|
||||
struct hlist_bl_node *e;
|
||||
struct ip_vs_dest *dest;
|
||||
unsigned int idx;
|
||||
int old_gen, new_gen;
|
||||
|
||||
if (event != NETDEV_DOWN || !ipvs)
|
||||
return NOTIFY_DONE;
|
||||
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
|
||||
|
||||
old_gen = atomic_read(&ipvs->svc_table_changes);
|
||||
|
||||
rcu_read_lock();
|
||||
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
|
||||
hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list)
|
||||
|
||||
repeat:
|
||||
smp_rmb(); /* ipvs->svc_table and svc_table_changes */
|
||||
ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
|
||||
hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
|
||||
list_for_each_entry_rcu(dest, &svc->destinations,
|
||||
n_list)
|
||||
n_list) {
|
||||
ip_vs_forget_dev(dest, dev);
|
||||
resched_score += 10;
|
||||
}
|
||||
resched_score++;
|
||||
}
|
||||
resched_score++;
|
||||
if (resched_score >= 100) {
|
||||
resched_score = 0;
|
||||
cond_resched_rcu();
|
||||
new_gen = atomic_read(&ipvs->svc_table_changes);
|
||||
/* New table installed ? */
|
||||
if (old_gen != new_gen) {
|
||||
old_gen = new_gen;
|
||||
goto repeat;
|
||||
}
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
@@ -1777,14 +2182,28 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
|
||||
|
||||
static int ip_vs_zero_all(struct netns_ipvs *ipvs)
|
||||
{
|
||||
int idx;
|
||||
DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
|
||||
unsigned int resched_score = 0;
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_service *svc;
|
||||
struct hlist_bl_node *e;
|
||||
|
||||
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
|
||||
hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list)
|
||||
rcu_read_lock();
|
||||
|
||||
ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
|
||||
hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
|
||||
ip_vs_zero_service(svc);
|
||||
resched_score += 10;
|
||||
}
|
||||
resched_score++;
|
||||
if (resched_score >= 100) {
|
||||
resched_score = 0;
|
||||
cond_resched_rcu();
|
||||
}
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
ip_vs_zero_stats(&ipvs->tot_stats->s);
|
||||
return 0;
|
||||
}
|
||||
@@ -2218,7 +2637,8 @@ static struct ctl_table vs_vars[] = {
|
||||
|
||||
struct ip_vs_iter {
|
||||
struct seq_net_private p; /* Do not move this, netns depends upon it*/
|
||||
int bucket;
|
||||
struct ip_vs_rht *t;
|
||||
u32 bucket;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -2239,17 +2659,23 @@ static inline const char *ip_vs_fwd_name(unsigned int flags)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Do not expect consistent view during add, del and move(table resize).
|
||||
* We may miss entries and even show duplicates.
|
||||
*/
|
||||
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
|
||||
{
|
||||
struct net *net = seq_file_net(seq);
|
||||
struct netns_ipvs *ipvs = net_ipvs(net);
|
||||
struct ip_vs_iter *iter = seq->private;
|
||||
int idx;
|
||||
struct ip_vs_rht *t = iter->t;
|
||||
struct ip_vs_service *svc;
|
||||
struct hlist_bl_node *e;
|
||||
int idx;
|
||||
|
||||
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
|
||||
hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) {
|
||||
if (!t)
|
||||
return NULL;
|
||||
for (idx = 0; idx < t->size; idx++) {
|
||||
hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) {
|
||||
if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
|
||||
break;
|
||||
if (pos-- == 0) {
|
||||
iter->bucket = idx;
|
||||
return svc;
|
||||
@@ -2262,18 +2688,22 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
|
||||
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
__acquires(RCU)
|
||||
{
|
||||
struct ip_vs_iter *iter = seq->private;
|
||||
struct net *net = seq_file_net(seq);
|
||||
struct netns_ipvs *ipvs = net_ipvs(net);
|
||||
|
||||
rcu_read_lock();
|
||||
iter->t = rcu_dereference(ipvs->svc_table);
|
||||
return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
|
||||
}
|
||||
|
||||
|
||||
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
{
|
||||
struct hlist_node *e;
|
||||
struct ip_vs_iter *iter;
|
||||
struct ip_vs_service *svc;
|
||||
struct net *net = seq_file_net(seq);
|
||||
struct netns_ipvs *ipvs = net_ipvs(net);
|
||||
struct ip_vs_iter *iter;
|
||||
struct hlist_bl_node *e;
|
||||
struct ip_vs_rht *t;
|
||||
|
||||
++*pos;
|
||||
if (v == SEQ_START_TOKEN)
|
||||
@@ -2281,15 +2711,22 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
|
||||
svc = v;
|
||||
iter = seq->private;
|
||||
t = iter->t;
|
||||
if (!t)
|
||||
return NULL;
|
||||
|
||||
e = rcu_dereference(hlist_next_rcu(&svc->s_list));
|
||||
if (e)
|
||||
return hlist_entry(e, struct ip_vs_service, s_list);
|
||||
hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) {
|
||||
/* Our cursor was moved to new table ? */
|
||||
if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
|
||||
break;
|
||||
return svc;
|
||||
}
|
||||
|
||||
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
|
||||
hlist_for_each_entry_rcu(svc,
|
||||
&ipvs->svc_table[iter->bucket],
|
||||
s_list) {
|
||||
while (++iter->bucket < t->size) {
|
||||
hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket],
|
||||
s_list) {
|
||||
if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
|
||||
break;
|
||||
return svc;
|
||||
}
|
||||
}
|
||||
@@ -2770,13 +3207,18 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs,
|
||||
const struct ip_vs_get_services *get,
|
||||
struct ip_vs_get_services __user *uptr)
|
||||
{
|
||||
int idx, count=0;
|
||||
struct ip_vs_service *svc;
|
||||
struct ip_vs_service_entry entry;
|
||||
DECLARE_IP_VS_RHT_WALK_BUCKETS();
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_service *svc;
|
||||
struct hlist_bl_node *e;
|
||||
int count = 0;
|
||||
int ret = 0;
|
||||
|
||||
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
|
||||
hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) {
|
||||
lockdep_assert_held(&ipvs->svc_resize_sem);
|
||||
/* All service modifications are disabled, go ahead */
|
||||
ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
|
||||
hlist_bl_for_each_entry(svc, e, head, s_list) {
|
||||
/* Only expose IPv4 entries to old interface */
|
||||
if (svc->af != AF_INET)
|
||||
continue;
|
||||
@@ -2948,6 +3390,35 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (cmd == IP_VS_SO_GET_SERVICES) {
|
||||
struct ip_vs_get_services *get;
|
||||
size_t size;
|
||||
|
||||
get = (struct ip_vs_get_services *)arg;
|
||||
size = struct_size(get, entrytable, get->num_services);
|
||||
if (*len != size) {
|
||||
pr_err("length: %u != %zu\n", *len, size);
|
||||
return -EINVAL;
|
||||
}
|
||||
/* Protect against table resizer moving the entries.
|
||||
* Try reverse locking, so that we do not hold the mutex
|
||||
* while waiting for semaphore.
|
||||
*/
|
||||
while (1) {
|
||||
ret = down_read_killable(&ipvs->svc_resize_sem);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (mutex_trylock(&ipvs->service_mutex))
|
||||
break;
|
||||
up_read(&ipvs->svc_resize_sem);
|
||||
cond_resched();
|
||||
}
|
||||
ret = __ip_vs_get_service_entries(ipvs, get, user);
|
||||
up_read(&ipvs->svc_resize_sem);
|
||||
mutex_unlock(&ipvs->service_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
||||
mutex_lock(&ipvs->service_mutex);
|
||||
switch (cmd) {
|
||||
case IP_VS_SO_GET_VERSION:
|
||||
@@ -2976,22 +3447,6 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
|
||||
}
|
||||
break;
|
||||
|
||||
case IP_VS_SO_GET_SERVICES:
|
||||
{
|
||||
struct ip_vs_get_services *get;
|
||||
size_t size;
|
||||
|
||||
get = (struct ip_vs_get_services *)arg;
|
||||
size = struct_size(get, entrytable, get->num_services);
|
||||
if (*len != size) {
|
||||
pr_err("length: %u != %zu\n", *len, size);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
ret = __ip_vs_get_service_entries(ipvs, get, user);
|
||||
}
|
||||
break;
|
||||
|
||||
case IP_VS_SO_GET_SERVICE:
|
||||
{
|
||||
struct ip_vs_service_entry *entry;
|
||||
@@ -3277,15 +3732,19 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb,
|
||||
static int ip_vs_genl_dump_services(struct sk_buff *skb,
|
||||
struct netlink_callback *cb)
|
||||
{
|
||||
int idx = 0, i;
|
||||
int start = cb->args[0];
|
||||
struct ip_vs_service *svc;
|
||||
DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
|
||||
struct net *net = sock_net(skb->sk);
|
||||
struct netns_ipvs *ipvs = net_ipvs(net);
|
||||
struct hlist_bl_head *head;
|
||||
struct ip_vs_service *svc;
|
||||
struct hlist_bl_node *e;
|
||||
int start = cb->args[0];
|
||||
int idx = 0;
|
||||
|
||||
down_read(&ipvs->svc_resize_sem);
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
|
||||
hlist_for_each_entry_rcu(svc, &ipvs->svc_table[i], s_list) {
|
||||
ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
|
||||
hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
|
||||
if (++idx <= start)
|
||||
continue;
|
||||
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
|
||||
@@ -3297,6 +3756,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
|
||||
|
||||
nla_put_failure:
|
||||
rcu_read_unlock();
|
||||
up_read(&ipvs->svc_resize_sem);
|
||||
cb->args[0] = idx;
|
||||
|
||||
return skb->len;
|
||||
@@ -4306,8 +4766,10 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
|
||||
|
||||
/* Initialize service_mutex, svc_table per netns */
|
||||
__mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key);
|
||||
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)
|
||||
INIT_HLIST_HEAD(&ipvs->svc_table[idx]);
|
||||
init_rwsem(&ipvs->svc_resize_sem);
|
||||
INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
|
||||
atomic_set(&ipvs->svc_table_changes, 0);
|
||||
RCU_INIT_POINTER(ipvs->svc_table, NULL);
|
||||
|
||||
/* Initialize rs_table */
|
||||
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
|
||||
@@ -4326,6 +4788,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
|
||||
}
|
||||
|
||||
INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
|
||||
ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs);
|
||||
|
||||
/* procfs stats */
|
||||
ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats);
|
||||
|
||||
@@ -132,9 +132,9 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
|
||||
}
|
||||
|
||||
static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
|
||||
u32 initval, bool inverse)
|
||||
struct ip_vs_rht *t, bool inverse)
|
||||
{
|
||||
return jhash(p->pe_data, p->pe_data_len, initval);
|
||||
return jhash(p->pe_data, p->pe_data_len, (u32)t->hash_key.key[0]);
|
||||
}
|
||||
|
||||
static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
|
||||
|
||||
@@ -1755,6 +1755,28 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
|
||||
if (!ip_vs_use_count_inc())
|
||||
return -ENOPROTOOPT;
|
||||
|
||||
/* Backup server can be started without services just to sync conns,
|
||||
* make sure conn_tab is created even if ipvs->enable is 0.
|
||||
*/
|
||||
if (state == IP_VS_STATE_BACKUP) {
|
||||
mutex_lock(&ipvs->service_mutex);
|
||||
if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
|
||||
int lfactor = sysctl_conn_lfactor(ipvs);
|
||||
int new_size = ip_vs_conn_desired_size(ipvs, NULL,
|
||||
lfactor);
|
||||
struct ip_vs_rht *tc_new;
|
||||
|
||||
tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
|
||||
if (!tc_new) {
|
||||
mutex_unlock(&ipvs->service_mutex);
|
||||
result = -ENOMEM;
|
||||
goto out_module;
|
||||
}
|
||||
rcu_assign_pointer(ipvs->conn_tab, tc_new);
|
||||
}
|
||||
mutex_unlock(&ipvs->service_mutex);
|
||||
}
|
||||
|
||||
/* Do not hold one mutex and then to block on another */
|
||||
for (;;) {
|
||||
rtnl_lock();
|
||||
@@ -1922,6 +1944,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
|
||||
mutex_unlock(&ipvs->sync_mutex);
|
||||
rtnl_unlock();
|
||||
|
||||
out_module:
|
||||
/* decrease the module use count */
|
||||
ip_vs_use_count_dec();
|
||||
return result;
|
||||
|
||||
@@ -165,18 +165,26 @@ static struct nf_logger nf_arp_logger __read_mostly = {
|
||||
static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
|
||||
struct sock *sk)
|
||||
{
|
||||
const struct socket *sock;
|
||||
const struct file *file;
|
||||
|
||||
if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
|
||||
return;
|
||||
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
if (sk->sk_socket && sk->sk_socket->file) {
|
||||
const struct cred *cred = sk->sk_socket->file->f_cred;
|
||||
/* The sk pointer remains valid as long as the skb is. The sk_socket and
|
||||
* file pointer may become NULL if the socket is closed. Both structures
|
||||
* (including file->cred) are RCU freed which means they can be accessed
|
||||
* within a RCU read section.
|
||||
*/
|
||||
sock = READ_ONCE(sk->sk_socket);
|
||||
file = sock ? READ_ONCE(sock->file) : NULL;
|
||||
if (file) {
|
||||
const struct cred *cred = file->f_cred;
|
||||
|
||||
nf_log_buf_add(m, "UID=%u GID=%u ",
|
||||
from_kuid_munged(&init_user_ns, cred->fsuid),
|
||||
from_kgid_munged(&init_user_ns, cred->fsgid));
|
||||
}
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
}
|
||||
|
||||
static noinline_for_stack int
|
||||
|
||||
@@ -611,19 +611,26 @@ __build_packet_message(struct nfnl_log_net *log,
|
||||
/* UID */
|
||||
sk = skb->sk;
|
||||
if (sk && sk_fullsock(sk)) {
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
if (sk->sk_socket && sk->sk_socket->file) {
|
||||
struct file *file = sk->sk_socket->file;
|
||||
const struct socket *sock;
|
||||
const struct file *file;
|
||||
|
||||
/* The sk pointer remains valid as long as the skb is.
|
||||
* The sk_socket and file pointer may become NULL
|
||||
* if the socket is closed.
|
||||
* Both structures (including file->cred) are RCU freed
|
||||
* which means they can be accessed within a RCU read section.
|
||||
*/
|
||||
sock = READ_ONCE(sk->sk_socket);
|
||||
file = sock ? READ_ONCE(sock->file) : NULL;
|
||||
if (file) {
|
||||
const struct cred *cred = file->f_cred;
|
||||
struct user_namespace *user_ns = inst->peer_user_ns;
|
||||
__be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid));
|
||||
__be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid));
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
if (nla_put_be32(inst->skb, NFULA_UID, uid) ||
|
||||
nla_put_be32(inst->skb, NFULA_GID, gid))
|
||||
goto nla_put_failure;
|
||||
} else
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/* local sequence number */
|
||||
|
||||
@@ -545,14 +545,23 @@ nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
|
||||
|
||||
static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
|
||||
{
|
||||
const struct socket *sock;
|
||||
const struct file *file;
|
||||
const struct cred *cred;
|
||||
|
||||
if (!sk_fullsock(sk))
|
||||
return 0;
|
||||
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
if (sk->sk_socket && sk->sk_socket->file) {
|
||||
cred = sk->sk_socket->file->f_cred;
|
||||
/* The sk pointer remains valid as long as the skb is.
|
||||
* The sk_socket and file pointer may become NULL
|
||||
* if the socket is closed.
|
||||
* Both structures (including file->cred) are RCU freed
|
||||
* which means they can be accessed within a RCU read section.
|
||||
*/
|
||||
sock = READ_ONCE(sk->sk_socket);
|
||||
file = sock ? READ_ONCE(sock->file) : NULL;
|
||||
if (file) {
|
||||
cred = file->f_cred;
|
||||
if (nla_put_be32(skb, NFQA_UID,
|
||||
htonl(from_kuid_munged(&init_user_ns, cred->fsuid))))
|
||||
goto nla_put_failure;
|
||||
@@ -560,11 +569,9 @@ static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
|
||||
htonl(from_kgid_munged(&init_user_ns, cred->fsgid))))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -585,15 +592,8 @@ static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx)
|
||||
{
|
||||
int seclen = 0;
|
||||
#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
|
||||
|
||||
if (!skb || !sk_fullsock(skb->sk))
|
||||
return 0;
|
||||
|
||||
read_lock_bh(&skb->sk->sk_callback_lock);
|
||||
|
||||
if (skb->secmark)
|
||||
seclen = security_secid_to_secctx(skb->secmark, ctx);
|
||||
read_unlock_bh(&skb->sk->sk_callback_lock);
|
||||
#endif
|
||||
return seclen;
|
||||
}
|
||||
|
||||
@@ -131,33 +131,36 @@ nft_meta_get_eval_skugid(enum nft_meta_keys key,
|
||||
u32 *dest,
|
||||
const struct nft_pktinfo *pkt)
|
||||
{
|
||||
struct sock *sk = skb_to_full_sk(pkt->skb);
|
||||
struct socket *sock;
|
||||
const struct sock *sk = skb_to_full_sk(pkt->skb);
|
||||
const struct socket *sock;
|
||||
const struct file *file;
|
||||
|
||||
if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
|
||||
return false;
|
||||
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
sock = sk->sk_socket;
|
||||
if (!sock || !sock->file) {
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
/* The sk pointer remains valid as long as the skb is. The sk_socket and
|
||||
* file pointer may become NULL if the socket is closed. Both structures
|
||||
* (including file->cred) are RCU freed which means they can be accessed
|
||||
* within a RCU read section.
|
||||
*/
|
||||
sock = READ_ONCE(sk->sk_socket);
|
||||
file = sock ? READ_ONCE(sock->file) : NULL;
|
||||
if (!file)
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (key) {
|
||||
case NFT_META_SKUID:
|
||||
*dest = from_kuid_munged(sock_net(sk)->user_ns,
|
||||
sock->file->f_cred->fsuid);
|
||||
file->f_cred->fsuid);
|
||||
break;
|
||||
case NFT_META_SKGID:
|
||||
*dest = from_kgid_munged(sock_net(sk)->user_ns,
|
||||
sock->file->f_cred->fsgid);
|
||||
file->f_cred->fsgid);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -63,11 +63,12 @@ static bool
|
||||
owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
|
||||
{
|
||||
const struct xt_owner_match_info *info = par->matchinfo;
|
||||
const struct file *filp;
|
||||
struct sock *sk = skb_to_full_sk(skb);
|
||||
struct net *net = xt_net(par);
|
||||
const struct socket *sock;
|
||||
const struct file *filp;
|
||||
|
||||
if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk)))
|
||||
if (!sk || !READ_ONCE(sk->sk_socket) || !net_eq(net, sock_net(sk)))
|
||||
return (info->match ^ info->invert) == 0;
|
||||
else if (info->match & info->invert & XT_OWNER_SOCKET)
|
||||
/*
|
||||
@@ -76,23 +77,25 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
|
||||
*/
|
||||
return false;
|
||||
|
||||
read_lock_bh(&sk->sk_callback_lock);
|
||||
filp = sk->sk_socket ? sk->sk_socket->file : NULL;
|
||||
if (filp == NULL) {
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
/* The sk pointer remains valid as long as the skb is. The sk_socket and
|
||||
* file pointer may become NULL if the socket is closed. Both structures
|
||||
* (including file->cred) are RCU freed which means they can be accessed
|
||||
* within a RCU read section.
|
||||
*/
|
||||
sock = READ_ONCE(sk->sk_socket);
|
||||
filp = sock ? READ_ONCE(sock->file) : NULL;
|
||||
if (filp == NULL)
|
||||
return ((info->match ^ info->invert) &
|
||||
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
|
||||
}
|
||||
|
||||
if (info->match & XT_OWNER_UID) {
|
||||
kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
|
||||
kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
|
||||
|
||||
if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
|
||||
uid_lte(filp->f_cred->fsuid, uid_max)) ^
|
||||
!(info->invert & XT_OWNER_UID)) {
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
!(info->invert & XT_OWNER_UID))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (info->match & XT_OWNER_GID) {
|
||||
@@ -117,13 +120,10 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
|
||||
}
|
||||
}
|
||||
|
||||
if (match ^ !(info->invert & XT_OWNER_GID)) {
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
if (match ^ !(info->invert & XT_OWNER_GID))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
read_unlock_bh(&sk->sk_callback_lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user