diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c775cababc8c..75db2251649b 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -131,10 +131,7 @@ Contents: vxlan x25 x25-iface - xfrm_device - xfrm_proc - xfrm_sync - xfrm_sysctl + xfrm/index xdp-rx-metadata xsk-tx-metadata diff --git a/Documentation/networking/xfrm/index.rst b/Documentation/networking/xfrm/index.rst new file mode 100644 index 000000000000..7d866da836fe --- /dev/null +++ b/Documentation/networking/xfrm/index.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +XFRM Framework +============== + +.. toctree:: + :maxdepth: 2 + + xfrm_device + xfrm_proc + xfrm_sync + xfrm_sysctl diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm/xfrm_device.rst similarity index 95% rename from Documentation/networking/xfrm_device.rst rename to Documentation/networking/xfrm/xfrm_device.rst index 122204da0fff..b0d85a5f57d1 100644 --- a/Documentation/networking/xfrm_device.rst +++ b/Documentation/networking/xfrm/xfrm_device.rst @@ -20,11 +20,15 @@ can radically increase throughput and decrease CPU utilization. The XFRM Device interface allows NIC drivers to offer to the stack access to the hardware offload. -Right now, there are two types of hardware offload that kernel supports. +Right now, there are two types of hardware offload that kernel supports: + * IPsec crypto offload: + * NIC performs encrypt/decrypt * Kernel does everything else + * IPsec packet offload: + * NIC performs encrypt/decrypt * NIC does encapsulation * Kernel and NIC have SA and policy in-sync @@ -34,7 +38,7 @@ Right now, there are two types of hardware offload that kernel supports. Userland access to the offload is typically through a system such as libreswan or KAME/raccoon, but the iproute2 'ip xfrm' command set can be handy when experimenting. An example command might look something -like this for crypto offload: +like this for crypto offload:: ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \ reqid 0x07 replay-window 32 \ @@ -42,7 +46,7 @@ like this for crypto offload: sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp \ offload dev eth4 dir in -and for packet offload +and for packet offload:: ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \ reqid 0x07 replay-window 32 \ @@ -153,26 +157,26 @@ the packet's skb. At this point the data should be decrypted but the IPsec headers are still in the packet data; they are removed later up the stack in xfrm_input(). - find and hold the SA that was used to the Rx skb:: +1. Find and hold the SA that was used to the Rx skb:: - get spi, protocol, and destination IP from packet headers + /* get spi, protocol, and destination IP from packet headers */ xs = find xs from (spi, protocol, dest_IP) xfrm_state_hold(xs); - store the state information into the skb:: +2. Store the state information into the skb:: sp = secpath_set(skb); if (!sp) return; sp->xvec[sp->len++] = xs; sp->olen++; - indicate the success and/or error status of the offload:: +3. Indicate the success and/or error status of the offload:: xo = xfrm_offload(skb); xo->flags = CRYPTO_DONE; xo->status = crypto_status; - hand the packet to napi_gro_receive() as usual +4. Hand the packet to napi_gro_receive() as usual. In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn() for RX, and xfrm_replay_overflow_offload_esn for TX. diff --git a/Documentation/networking/xfrm_proc.rst b/Documentation/networking/xfrm/xfrm_proc.rst similarity index 100% rename from Documentation/networking/xfrm_proc.rst rename to Documentation/networking/xfrm/xfrm_proc.rst diff --git a/Documentation/networking/xfrm_sync.rst b/Documentation/networking/xfrm/xfrm_sync.rst similarity index 64% rename from Documentation/networking/xfrm_sync.rst rename to Documentation/networking/xfrm/xfrm_sync.rst index 6246503ceab2..dfc2ec0df380 100644 --- a/Documentation/networking/xfrm_sync.rst +++ b/Documentation/networking/xfrm/xfrm_sync.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 -==== -XFRM -==== +========= +XFRM sync +========= The sync patches work is based on initial patches from Krisztian and others and additional patches @@ -36,7 +36,7 @@ is not driven by packet arrival. - the replay sequence for both inbound and outbound 1) Message Structure ----------------------- +-------------------- nlmsghdr:aevent_id:optional-TLVs. @@ -83,31 +83,31 @@ when going from kernel to user space) A program needs to subscribe to multicast group XFRMNLGRP_AEVENTS to get notified of these events. -2) TLVS reflect the different parameters: ------------------------------------------ +2) TLVS reflect the different parameters +---------------------------------------- a) byte value (XFRMA_LTIME_VAL) -This TLV carries the running/current counter for byte lifetime since -last event. + This TLV carries the running/current counter for byte lifetime since + last event. -b)replay value (XFRMA_REPLAY_VAL) +b) replay value (XFRMA_REPLAY_VAL) -This TLV carries the running/current counter for replay sequence since -last event. + This TLV carries the running/current counter for replay sequence since + last event. -c)replay threshold (XFRMA_REPLAY_THRESH) +c) replay threshold (XFRMA_REPLAY_THRESH) -This TLV carries the threshold being used by the kernel to trigger events -when the replay sequence is exceeded. + This TLV carries the threshold being used by the kernel to trigger events + when the replay sequence is exceeded. d) expiry timer (XFRMA_ETIMER_THRESH) -This is a timer value in milliseconds which is used as the nagle -value to rate limit the events. + This is a timer value in milliseconds which is used as the nagle + value to rate limit the events. -3) Default configurations for the parameters: ---------------------------------------------- +3) Default configurations for the parameters +-------------------------------------------- By default these events should be turned off unless there is at least one listener registered to listen to the multicast @@ -121,12 +121,14 @@ in case they are not specified. the two sysctls/proc entries are: a) /proc/sys/net/core/sysctl_xfrm_aevent_etime -used to provide default values for the XFRMA_ETIMER_THRESH in incremental -units of time of 100ms. The default is 10 (1 second) + + Used to provide default values for the XFRMA_ETIMER_THRESH in incremental + units of time of 100ms. The default is 10 (1 second) b) /proc/sys/net/core/sysctl_xfrm_aevent_rseqth -used to provide default values for XFRMA_REPLAY_THRESH parameter -in incremental packet count. The default is two packets. + + Used to provide default values for XFRMA_REPLAY_THRESH parameter + in incremental packet count. The default is two packets. 4) Message types ---------------- @@ -134,50 +136,51 @@ in incremental packet count. The default is two packets. a) XFRM_MSG_GETAE issued by user-->kernel. XFRM_MSG_GETAE does not carry any TLVs. -The response is a XFRM_MSG_NEWAE which is formatted based on what -XFRM_MSG_GETAE queried for. + The response is a XFRM_MSG_NEWAE which is formatted based on what + XFRM_MSG_GETAE queried for. -The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. -* if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved -* if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved + The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. + + * if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved + * if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved b) XFRM_MSG_NEWAE is issued by either user space to configure or kernel to announce events or respond to a XFRM_MSG_GETAE. -i) user --> kernel to configure a specific SA. + i) user --> kernel to configure a specific SA. -any of the values or threshold parameters can be updated by passing the -appropriate TLV. + any of the values or threshold parameters can be updated by passing the + appropriate TLV. -A response is issued back to the sender in user space to indicate success -or failure. + A response is issued back to the sender in user space to indicate success + or failure. -In the case of success, additionally an event with -XFRM_MSG_NEWAE is also issued to any listeners as described in iii). + In the case of success, additionally an event with + XFRM_MSG_NEWAE is also issued to any listeners as described in iii). -ii) kernel->user direction as a response to XFRM_MSG_GETAE + ii) kernel->user direction as a response to XFRM_MSG_GETAE -The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. + The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. -The threshold TLVs will be included if explicitly requested in -the XFRM_MSG_GETAE message. + The threshold TLVs will be included if explicitly requested in + the XFRM_MSG_GETAE message. -iii) kernel->user to report as event if someone sets any values or - thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above). - In such a case XFRM_AE_CU flag is set to inform the user that - the change happened as a result of an update. - The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. + iii) kernel->user to report as event if someone sets any values or + thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above). + In such a case XFRM_AE_CU flag is set to inform the user that + the change happened as a result of an update. + The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. -iv) kernel->user to report event when replay threshold or a timeout - is exceeded. + iv) kernel->user to report event when replay threshold or a timeout + is exceeded. In such a case either XFRM_AE_CR (replay exceeded) or XFRM_AE_CE (timeout happened) is set to inform the user what happened. Note the two flags are mutually exclusive. The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs. -Exceptions to threshold settings --------------------------------- +5) Exceptions to threshold settings +----------------------------------- If you have an SA that is getting hit by traffic in bursts such that there is a period where the timer threshold expires with no packets diff --git a/Documentation/networking/xfrm_sysctl.rst b/Documentation/networking/xfrm/xfrm_sysctl.rst similarity index 68% rename from Documentation/networking/xfrm_sysctl.rst rename to Documentation/networking/xfrm/xfrm_sysctl.rst index 47b9bbdd0179..7d0c4b17c0bd 100644 --- a/Documentation/networking/xfrm_sysctl.rst +++ b/Documentation/networking/xfrm/xfrm_sysctl.rst @@ -4,8 +4,8 @@ XFRM Syscall ============ -/proc/sys/net/core/xfrm_* Variables: -==================================== +/proc/sys/net/core/xfrm_* Variables +=================================== xfrm_acq_expires - INTEGER default 30 - hard timeout in seconds for acquire requests diff --git a/MAINTAINERS b/MAINTAINERS index 37f4278db851..b0223cfb2583 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18068,6 +18068,7 @@ L: netdev@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git +F: Documentation/networking/xfrm/ F: include/net/xfrm.h F: include/uapi/linux/xfrm.h F: net/ipv4/ah4.c diff --git a/net/key/af_key.c b/net/key/af_key.c index 2ebde0352245..571200433aa9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3903,6 +3903,8 @@ static int __init ipsec_pfkey_init(void) { int err = proto_register(&key_proto, 0); + pr_warn_once("PFKEY is deprecated and scheduled to be removed in 2027, " + "please contact the netdev mailing list\n"); if (err != 0) goto out; diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index f0157702718f..4a62817a88f8 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -110,14 +110,17 @@ config XFRM_IPCOMP select CRYPTO_DEFLATE config NET_KEY - tristate "PF_KEY sockets" + tristate "PF_KEY sockets (deprecated)" select XFRM_ALGO help PF_KEYv2 socket family, compatible to KAME ones. - They are required if you are going to use IPsec tools ported - from KAME. - Say Y unless you know what you are doing. + The PF_KEYv2 socket interface is deprecated and + scheduled for removal. All maintained IKE daemons + no longer need PF_KEY sockets. Please use the netlink + interface (XFRM_USER) to configure IPsec. + + If unsure, say N. config NET_KEY_MIGRATE bool "PF_KEY MIGRATE" diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c9ddef869aa5..4ed346e682c7 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -505,6 +505,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) async = 1; dev_put(skb->dev); seq = XFRM_SKB_CB(skb)->seq.input.low; + spin_lock(&x->lock); goto resume; } /* GRO call */ @@ -541,9 +542,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } + + nexthdr = x->type_offload->input_tail(x, skb); } - goto lock; + goto process; } family = XFRM_SPI_SKB_CB(skb)->family; @@ -611,7 +614,12 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } -lock: +process: + seq_hi = htonl(xfrm_replay_seqhi(x, seq)); + + XFRM_SKB_CB(skb)->seq.input.low = seq; + XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; + spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { @@ -638,21 +646,13 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop_unlock; } - spin_unlock(&x->lock); - if (xfrm_tunnel_check(skb, x, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); - goto drop; + goto drop_unlock; } - seq_hi = htonl(xfrm_replay_seqhi(x, seq)); - - XFRM_SKB_CB(skb)->seq.input.low = seq; - XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; - - if (crypto_done) { - nexthdr = x->type_offload->input_tail(x, skb); - } else { + if (!crypto_done) { + spin_unlock(&x->lock); dev_hold(skb->dev); nexthdr = x->type->input(x, skb); @@ -660,9 +660,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) return 0; dev_put(skb->dev); + spin_lock(&x->lock); } resume: - spin_lock(&x->lock); if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, @@ -676,7 +676,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) /* only the first xfrm gets the encap type */ encap_type = 0; - if (xfrm_replay_recheck(x, skb, seq)) { + if (!crypto_done && xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; }