mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-04-04 15:55:59 -04:00
net/rds: Encode cp_index in TCP source port
Upon "sendmsg", RDS/TCP selects a backend connection based
on a hash calculated from the source-port ("RDS_MPATH_HASH").
However, "rds_tcp_accept_one" accepts connections
in the order they arrive, which is non-deterministic.
Therefore the mapping of the sender's "cp->cp_index"
to that of the receiver can change whenever the backend
connections are dropped and reconnected.
However, connection state that's preserved across reconnects
(e.g. "cp_next_rx_seq") relies on that sender<->receiver
mapping never changing.
So we make sure that client and server of the TCP connection
have the exact same "cp->cp_index" across reconnects by
encoding "cp->cp_index" in the lower three bits of the
client's TCP source port.
A new extension, "RDS_EXTHDR_SPORT_IDX", is introduced
that allows the server to tell the difference between
clients that do the "cp->cp_index" encoding and
legacy clients that pick source ports randomly.
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
Link: https://patch.msgid.link/20260203055723.1085751-3-achender@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
committed by
Jakub Kicinski
parent
46f257ee69
commit
a20a699255
@@ -47,6 +47,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
|
|||||||
[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
|
[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
|
||||||
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
|
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
|
||||||
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
|
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
|
||||||
|
[RDS_EXTHDR_SPORT_IDX] = 1,
|
||||||
};
|
};
|
||||||
|
|
||||||
void rds_message_addref(struct rds_message *rm)
|
void rds_message_addref(struct rds_message *rm)
|
||||||
|
|||||||
@@ -147,6 +147,7 @@ struct rds_connection {
|
|||||||
c_ping_triggered:1,
|
c_ping_triggered:1,
|
||||||
c_pad_to_32:29;
|
c_pad_to_32:29;
|
||||||
int c_npaths;
|
int c_npaths;
|
||||||
|
bool c_with_sport_idx;
|
||||||
struct rds_connection *c_passive;
|
struct rds_connection *c_passive;
|
||||||
struct rds_transport *c_trans;
|
struct rds_transport *c_trans;
|
||||||
|
|
||||||
@@ -278,8 +279,10 @@ struct rds_ext_header_rdma_bytes {
|
|||||||
*/
|
*/
|
||||||
#define RDS_EXTHDR_NPATHS 5
|
#define RDS_EXTHDR_NPATHS 5
|
||||||
#define RDS_EXTHDR_GEN_NUM 6
|
#define RDS_EXTHDR_GEN_NUM 6
|
||||||
|
#define RDS_EXTHDR_SPORT_IDX 8
|
||||||
|
|
||||||
#define __RDS_EXTHDR_MAX 16 /* for now */
|
#define __RDS_EXTHDR_MAX 16 /* for now */
|
||||||
|
|
||||||
#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
|
#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
|
||||||
#define RDS_MSG_RX_HDR 0
|
#define RDS_MSG_RX_HDR 0
|
||||||
#define RDS_MSG_RX_START 1
|
#define RDS_MSG_RX_START 1
|
||||||
|
|||||||
@@ -204,7 +204,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
|
|||||||
struct rds_ext_header_version version;
|
struct rds_ext_header_version version;
|
||||||
__be16 rds_npaths;
|
__be16 rds_npaths;
|
||||||
__be32 rds_gen_num;
|
__be32 rds_gen_num;
|
||||||
|
u8 dummy;
|
||||||
} buffer;
|
} buffer;
|
||||||
|
bool new_with_sport_idx = false;
|
||||||
u32 new_peer_gen_num = 0;
|
u32 new_peer_gen_num = 0;
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
@@ -221,11 +223,16 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
|
|||||||
case RDS_EXTHDR_GEN_NUM:
|
case RDS_EXTHDR_GEN_NUM:
|
||||||
new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
|
new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
|
||||||
break;
|
break;
|
||||||
|
case RDS_EXTHDR_SPORT_IDX:
|
||||||
|
new_with_sport_idx = true;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
pr_warn_ratelimited("ignoring unknown exthdr type "
|
pr_warn_ratelimited("ignoring unknown exthdr type "
|
||||||
"0x%x\n", type);
|
"0x%x\n", type);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
conn->c_with_sport_idx = new_with_sport_idx;
|
||||||
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
|
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
|
||||||
conn->c_npaths = max_t(int, conn->c_npaths, 1);
|
conn->c_npaths = max_t(int, conn->c_npaths, 1);
|
||||||
conn->c_ping_triggered = 0;
|
conn->c_ping_triggered = 0;
|
||||||
|
|||||||
@@ -1457,12 +1457,16 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
|
|||||||
cp->cp_conn->c_trans->t_mp_capable) {
|
cp->cp_conn->c_trans->t_mp_capable) {
|
||||||
__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
|
__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
|
||||||
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
|
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
|
||||||
|
u8 dummy = 0;
|
||||||
|
|
||||||
rds_message_add_extension(&rm->m_inc.i_hdr,
|
rds_message_add_extension(&rm->m_inc.i_hdr,
|
||||||
RDS_EXTHDR_NPATHS, &npaths);
|
RDS_EXTHDR_NPATHS, &npaths);
|
||||||
rds_message_add_extension(&rm->m_inc.i_hdr,
|
rds_message_add_extension(&rm->m_inc.i_hdr,
|
||||||
RDS_EXTHDR_GEN_NUM,
|
RDS_EXTHDR_GEN_NUM,
|
||||||
&my_gen_num);
|
&my_gen_num);
|
||||||
|
rds_message_add_extension(&rm->m_inc.i_hdr,
|
||||||
|
RDS_EXTHDR_SPORT_IDX,
|
||||||
|
&dummy);
|
||||||
}
|
}
|
||||||
spin_unlock_irqrestore(&cp->cp_lock, flags);
|
spin_unlock_irqrestore(&cp->cp_lock, flags);
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ struct rds_tcp_connection {
|
|||||||
*/
|
*/
|
||||||
struct mutex t_conn_path_lock;
|
struct mutex t_conn_path_lock;
|
||||||
struct socket *t_sock;
|
struct socket *t_sock;
|
||||||
|
u32 t_client_port_group;
|
||||||
struct rds_tcp_net *t_rtn;
|
struct rds_tcp_net *t_rtn;
|
||||||
void *t_orig_write_space;
|
void *t_orig_write_space;
|
||||||
void *t_orig_data_ready;
|
void *t_orig_data_ready;
|
||||||
|
|||||||
@@ -93,6 +93,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
|
|||||||
struct sockaddr_in6 sin6;
|
struct sockaddr_in6 sin6;
|
||||||
struct sockaddr_in sin;
|
struct sockaddr_in sin;
|
||||||
struct sockaddr *addr;
|
struct sockaddr *addr;
|
||||||
|
int port_low, port_high, port;
|
||||||
|
int port_groups, groups_left;
|
||||||
int addrlen;
|
int addrlen;
|
||||||
bool isv6;
|
bool isv6;
|
||||||
int ret;
|
int ret;
|
||||||
@@ -145,7 +147,26 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
|
|||||||
addrlen = sizeof(sin);
|
addrlen = sizeof(sin);
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen);
|
/* encode cp->cp_index in lowest bits of source-port */
|
||||||
|
inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high);
|
||||||
|
port_low = ALIGN(port_low, RDS_MPATH_WORKERS);
|
||||||
|
port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS;
|
||||||
|
ret = -EADDRINUSE;
|
||||||
|
groups_left = port_groups;
|
||||||
|
while (groups_left-- > 0 && ret) {
|
||||||
|
if (++tc->t_client_port_group >= port_groups)
|
||||||
|
tc->t_client_port_group = 0;
|
||||||
|
port = port_low +
|
||||||
|
tc->t_client_port_group * RDS_MPATH_WORKERS +
|
||||||
|
cp->cp_index;
|
||||||
|
|
||||||
|
if (isv6)
|
||||||
|
sin6.sin6_port = htons(port);
|
||||||
|
else
|
||||||
|
sin.sin_port = htons(port);
|
||||||
|
ret = kernel_bind(sock, (struct sockaddr_unsized *)addr,
|
||||||
|
addrlen);
|
||||||
|
}
|
||||||
if (ret) {
|
if (ret) {
|
||||||
rdsdebug("bind failed with %d at address %pI6c\n",
|
rdsdebug("bind failed with %d at address %pI6c\n",
|
||||||
ret, &conn->c_laddr);
|
ret, &conn->c_laddr);
|
||||||
|
|||||||
@@ -62,19 +62,52 @@ void rds_tcp_keepalive(struct socket *sock)
|
|||||||
* we special case cp_index 0 is to allow the rds probe ping itself to itself
|
* we special case cp_index 0 is to allow the rds probe ping itself to itself
|
||||||
* get through efficiently.
|
* get through efficiently.
|
||||||
*/
|
*/
|
||||||
static
|
static struct rds_tcp_connection *
|
||||||
struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
|
rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
|
||||||
{
|
{
|
||||||
int i;
|
union {
|
||||||
int npaths = max_t(int, 1, conn->c_npaths);
|
struct sockaddr_storage storage;
|
||||||
|
struct sockaddr addr;
|
||||||
|
struct sockaddr_in sin;
|
||||||
|
struct sockaddr_in6 sin6;
|
||||||
|
} saddr;
|
||||||
|
int sport, npaths, i_min, i_max, i;
|
||||||
|
|
||||||
for (i = 0; i < npaths; i++) {
|
if (conn->c_with_sport_idx &&
|
||||||
|
kernel_getpeername(sock, &saddr.addr) >= 0) {
|
||||||
|
/* cp->cp_index is encoded in lowest bits of source-port */
|
||||||
|
switch (saddr.addr.sa_family) {
|
||||||
|
case AF_INET:
|
||||||
|
sport = ntohs(saddr.sin.sin_port);
|
||||||
|
break;
|
||||||
|
case AF_INET6:
|
||||||
|
sport = ntohs(saddr.sin6.sin6_port);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
sport = -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
sport = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
npaths = max_t(int, 1, conn->c_npaths);
|
||||||
|
|
||||||
|
if (sport >= 0) {
|
||||||
|
i_min = sport % npaths;
|
||||||
|
i_max = i_min;
|
||||||
|
} else {
|
||||||
|
i_min = 0;
|
||||||
|
i_max = npaths - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = i_min; i <= i_max; i++) {
|
||||||
struct rds_conn_path *cp = &conn->c_path[i];
|
struct rds_conn_path *cp = &conn->c_path[i];
|
||||||
|
|
||||||
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
|
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
|
||||||
RDS_CONN_CONNECTING))
|
RDS_CONN_CONNECTING))
|
||||||
return cp->cp_transport_data;
|
return cp->cp_transport_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -199,7 +232,7 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
|
|||||||
* to and discarded by the sender.
|
* to and discarded by the sender.
|
||||||
* We must not throw those away!
|
* We must not throw those away!
|
||||||
*/
|
*/
|
||||||
rs_tcp = rds_tcp_accept_one_path(conn);
|
rs_tcp = rds_tcp_accept_one_path(conn, new_sock);
|
||||||
if (!rs_tcp) {
|
if (!rs_tcp) {
|
||||||
/* It's okay to stash "new_sock", since
|
/* It's okay to stash "new_sock", since
|
||||||
* "rds_tcp_conn_slots_available" triggers
|
* "rds_tcp_conn_slots_available" triggers
|
||||||
|
|||||||
Reference in New Issue
Block a user