Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c0fcfa2..12dd08a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,6 +79,7 @@
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
+#include <net/mptcp.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -137,6 +138,69 @@
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif
+#ifdef CONFIG_CGROUP_BPF
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
+ struct bpf_sock_ops_kern sock_ops;
+
+ if (likely(!unknown_opt && !parse_all_opt))
+ return;
+
+ /* The skb will be handled in the
+ * bpf_skops_established() or
+ * bpf_skops_write_hdr_opt().
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_SYN_SENT:
+ case TCP_LISTEN:
+ return;
+ }
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = bpf_op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+#else
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+}
+#endif
+
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
unsigned int len)
{
@@ -437,7 +501,7 @@
/* 3. Try to fixup all. It is made immediately after connection enters
* established state.
*/
-void tcp_init_buffer_space(struct sock *sk)
+static void tcp_init_buffer_space(struct sock *sk)
{
int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
struct tcp_sock *tp = tcp_sk(sk);
@@ -518,7 +582,7 @@
*
* The algorithm for RTT estimation w/o timestamps is based on
* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://public.lanl.gov/radiant/pubs.html#DRS>
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
*
* More detail on this code can be found at
* <http://staff.psc.edu/jheffner/>,
@@ -871,12 +935,54 @@
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
-/* Take a notice that peer is sending D-SACKs */
-static void tcp_dsack_seen(struct tcp_sock *tp)
+struct tcp_sacktag_state {
+ /* Timestamps for earliest and latest never-retransmitted segment
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
+ * but congestion control should still get an accurate delay signal.
+ */
+ u64 first_sackt;
+ u64 last_sackt;
+ u32 reord;
+ u32 sack_delivered;
+ int flag;
+ unsigned int mss_now;
+ struct rate_sample *rate;
+};
+
+/* Take a notice that peer is sending D-SACKs. Skip update of data delivery
+ * and spurious retransmission information if this DSACK is unlikely caused by
+ * sender's action:
+ * - DSACKed sequence range is larger than maximum receiver's window.
+ * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
+ */
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
+ u32 end_seq, struct tcp_sacktag_state *state)
{
+ u32 seq_len, dup_segs = 1;
+
+ if (!before(start_seq, end_seq))
+ return 0;
+
+ seq_len = end_seq - start_seq;
+ /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
+ if (seq_len > tp->max_window)
+ return 0;
+ if (seq_len > tp->mss_cache)
+ dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
+
+ tp->dsack_dups += dup_segs;
+ /* Skip the DSACK if dup segs weren't retransmitted by sender */
+ if (tp->dsack_dups > tp->total_retrans)
+ return 0;
+
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
tp->rack.dsack_seen = 1;
- tp->dsack_dups++;
+
+ state->flag |= FLAG_DSACKING_ACK;
+ /* A spurious retransmission is delivered */
+ state->sack_delivered += dup_segs;
+
+ return dup_segs;
}
/* It's reordering when higher sequence was delivered (i.e. sacked) before
@@ -914,7 +1020,11 @@
ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
-/* This must be called before lost_out is incremented */
+ /* This must be called before lost_out or retrans_out are updated
+ * on a new loss, because we want to know if all skbs previously
+ * known to be lost have already been retransmitted, indicating
+ * that this newly lost skb is our next skb to retransmit.
+ */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
@@ -924,44 +1034,48 @@
tp->retransmit_skb_hint = skb;
}
-/* Sum the number of packets on the wire we have marked as lost.
- * There are two cases we care about here:
- * a) Packet hasn't been marked lost (nor retransmitted),
- * and this is the first loss.
- * b) Packet has been marked both lost and retransmitted,
- * and this means we think it was lost again.
+/* Sum the number of packets on the wire we have marked as lost, and
+ * notify the congestion control module that the given skb was marked lost.
*/
-static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ tp->lost += tcp_skb_pcount(skb);
+}
+
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
__u8 sacked = TCP_SKB_CB(skb)->sacked;
+ struct tcp_sock *tp = tcp_sk(sk);
- if (!(sacked & TCPCB_LOST) ||
- ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
- tp->lost += tcp_skb_pcount(skb);
-}
+ if (sacked & TCPCB_SACKED_ACKED)
+ return;
-static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
-{
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
- tcp_verify_retransmit_hint(tp, skb);
-
- tp->lost_out += tcp_skb_pcount(skb);
- tcp_sum_lost(tp, skb);
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- }
-}
-
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
-{
tcp_verify_retransmit_hint(tp, skb);
-
- tcp_sum_lost(tp, skb);
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ if (sacked & TCPCB_LOST) {
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* Account for retransmits that are lost again */
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
+ tcp_skb_pcount(skb));
+ tcp_notify_skb_loss_event(tp, skb);
+ }
+ } else {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tcp_notify_skb_loss_event(tp, skb);
}
}
+/* Updates the delivered and delivered_ce counts */
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
+ bool ece_ack)
+{
+ tp->delivered += delivered;
+ if (ece_ack)
+ tp->delivered_ce += delivered;
+}
+
/* This procedure tags the retransmission queue when SACKs arrive.
*
* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
@@ -1094,52 +1208,43 @@
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
- u32 prior_snd_una)
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
- bool dup_sack = false;
+ u32 dup_segs;
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
- dup_sack = true;
- tcp_dsack_seen(tp);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
- if (!after(end_seq_0, end_seq_1) &&
- !before(start_seq_0, start_seq_1)) {
- dup_sack = true;
- tcp_dsack_seen(tp);
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPDSACKOFORECV);
- }
+ if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
+ return false;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
+ } else {
+ return false;
}
+ dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
+ if (!dup_segs) { /* Skip dubious DSACK */
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
+ return false;
+ }
+
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
+
/* D-SACK for already forgotten data... Do dumb counting. */
- if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
!after(end_seq_0, prior_snd_una) &&
after(end_seq_0, tp->undo_marker))
- tp->undo_retrans--;
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
- return dup_sack;
+ return true;
}
-struct tcp_sacktag_state {
- u32 reord;
- /* Timestamps for earliest and latest never-retransmitted segment
- * that was SACKed. RTO needs the earliest RTT to stay conservative,
- * but congestion control should still get an accurate delay signal.
- */
- u64 first_sackt;
- u64 last_sackt;
- struct rate_sample *rate;
- int flag;
- unsigned int mss_now;
-};
-
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
* the incoming SACK may not exactly match but we can find smaller MSS
* aligned portion of it that matches. Therefore we might need to fragment
@@ -1258,7 +1363,8 @@
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
- tp->delivered += pcount; /* Out-of-order packets delivered */
+ /* Out-of-order packets delivered */
+ state->sack_delivered += pcount;
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
if (tp->lost_skb_hint &&
@@ -1425,7 +1531,7 @@
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
- if (!tcp_skb_can_collapse_to(prev))
+ if (!tcp_skb_can_collapse(prev, skb))
goto fallback;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
@@ -1514,6 +1620,8 @@
(mss != tcp_skb_seglen(skb)))
goto out;
+ if (!tcp_skb_can_collapse(prev, skb))
+ goto out;
len = skb->len;
pcount = tcp_skb_pcount(skb);
if (tcp_skb_shift(prev, skb, pcount, len))
@@ -1681,11 +1789,7 @@
tcp_highest_sack_reset(sk);
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
- num_sacks, prior_snd_una);
- if (found_dup_sack) {
- state->flag |= FLAG_DSACKING_ACK;
- tp->delivered++; /* A spurious retransmission is delivered */
- }
+ num_sacks, prior_snd_una, state);
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1893,7 +1997,7 @@
/* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
{
if (num_dupack) {
struct tcp_sock *tp = tcp_sk(sk);
@@ -1904,20 +2008,21 @@
tcp_check_reno_reordering(sk, 0);
delivered = tp->sacked_out - prior_sacked;
if (delivered > 0)
- tp->delivered += delivered;
+ tcp_count_delivered(tp, delivered, ece_ack);
tcp_verify_left_out(tp);
}
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
- tp->delivered += max_t(int, acked - tp->sacked_out, 1);
+ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
+ ece_ack);
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
@@ -2184,8 +2289,7 @@
}
/* Detect loss in event "A" above by marking head of queue up as lost.
- * For non-SACK(Reno) senders, the first "packets" number of segments
- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * For RFC3517 SACK, a segment is considered lost if it
* has at least tp->reordering SACKed seqments above it; "packets" refers to
* the maximum SACKed segments to pass before reaching this limit.
*/
@@ -2193,10 +2297,9 @@
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int cnt, oldcnt, lost;
- unsigned int mss;
+ int cnt;
/* Use SACK to deduce losses of new sequences sent during recovery */
- const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
+ const u32 loss_high = tp->snd_nxt;
WARN_ON(packets > tp->packets_out);
skb = tp->lost_skb_hint;
@@ -2219,28 +2322,14 @@
if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
break;
- oldcnt = cnt;
- if (tcp_is_reno(tp) ||
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
cnt += tcp_skb_pcount(skb);
- if (cnt > packets) {
- if (tcp_is_sack(tp) ||
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
- (oldcnt >= packets))
- break;
+ if (cnt > packets)
+ break;
- mss = tcp_skb_mss(skb);
- /* If needed, chop off the prefix to mark as lost. */
- lost = (packets - oldcnt) * mss;
- if (lost < skb->len &&
- tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
- lost, mss, GFP_ATOMIC) < 0)
- break;
- cnt = packets;
- }
-
- tcp_skb_mark_lost(tp, skb);
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
+ tcp_mark_skb_lost(sk, skb);
if (mark_head)
break;
@@ -2606,14 +2695,8 @@
unsigned int mss = tcp_current_mss(sk);
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
- if (tcp_skb_seglen(skb) > mss &&
- !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- }
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- }
+ if (tcp_skb_seglen(skb) > mss)
+ tcp_mark_skb_lost(sk, skb);
}
tcp_clear_retrans_hints_partial(tp);
@@ -2714,15 +2797,24 @@
* delivered. Lower inflight to clock out (re)tranmissions.
*/
if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
- tcp_add_reno_sack(sk, num_dupack);
+ tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
*rexmit = REXMIT_LOST;
}
+static bool tcp_force_fast_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return after(tcp_highest_sack_seq(tp),
+ tp->snd_una + tp->reordering * tp->mss_cache);
+}
+
/* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
+ bool *do_lost)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2747,7 +2839,9 @@
tcp_undo_cwnd_reduction(sk, true);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
- return true;
+ } else {
+ /* Partial ACK arrived. Force fast retransmit. */
+ *do_lost = tcp_force_fast_retransmit(sk);
}
return false;
}
@@ -2771,14 +2865,6 @@
}
}
-static bool tcp_force_fast_retransmit(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- return after(tcp_highest_sack_seq(tp),
- tp->snd_una + tp->reordering * tp->mss_cache);
-}
-
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2797,6 +2883,7 @@
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int fast_rexmit = 0, flag = *ack_flag;
+ bool ece_ack = flag & FLAG_ECE;
bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
tcp_force_fast_retransmit(sk));
@@ -2805,7 +2892,7 @@
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
- if (flag & FLAG_ECE)
+ if (ece_ack)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
@@ -2846,19 +2933,22 @@
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp))
- tcp_add_reno_sack(sk, num_dupack);
- } else {
- if (tcp_try_undo_partial(sk, prior_snd_una))
- return;
- /* Partial ACK arrived. Force fast retransmit. */
- do_lost = tcp_is_reno(tp) ||
- tcp_force_fast_retransmit(sk);
- }
- if (tcp_try_undo_dsack(sk)) {
- tcp_try_keep_open(sk);
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
+ } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
return;
- }
+
+ if (tcp_try_undo_dsack(sk))
+ tcp_try_keep_open(sk);
+
tcp_identify_packet_loss(sk, ack_flag);
+ if (icsk->icsk_ca_state != TCP_CA_Recovery) {
+ if (!tcp_time_to_recover(sk, flag))
+ return;
+ /* Undo reverts the recovery state. If loss is evident,
+ * starts a new recovery (e.g. reordering then loss);
+ */
+ tcp_enter_recovery(sk, ece_ack);
+ }
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, num_dupack, rexmit);
@@ -2867,12 +2957,12 @@
(*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
- /* fall through */
+ fallthrough;
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
- tcp_add_reno_sack(sk, num_dupack);
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
}
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
@@ -2896,7 +2986,7 @@
}
/* Otherwise enter Recovery state */
- tcp_enter_recovery(sk, (flag & FLAG_ECE));
+ tcp_enter_recovery(sk, ece_ack);
fast_rexmit = 1;
}
@@ -3018,7 +3108,7 @@
rto = usecs_to_jiffies(max_t(int, delta_us, 1));
}
tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
- TCP_RTO_MAX, tcp_rtx_queue_head(sk));
+ TCP_RTO_MAX);
}
}
@@ -3074,7 +3164,7 @@
*/
static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
u32 prior_snd_una,
- struct tcp_sacktag_state *sack)
+ struct tcp_sacktag_state *sack, bool ece_ack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
u64 first_ackt, last_ackt;
@@ -3099,8 +3189,6 @@
u8 sacked = scb->sacked;
u32 acked_pcount;
- tcp_ack_tstamp(sk, skb, prior_snd_una);
-
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) == 1 ||
@@ -3135,7 +3223,7 @@
if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
} else if (tcp_is_sack(tp)) {
- tp->delivered += acked_pcount;
+ tcp_count_delivered(tp, acked_pcount, ece_ack);
if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, sacked, scb->end_seq,
tcp_skb_timestamp_us(skb));
@@ -3164,6 +3252,8 @@
if (!fully_acked)
break;
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
+
next = skb_rb_next(skb);
if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
@@ -3179,8 +3269,11 @@
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
- if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
- flag |= FLAG_SACK_RENEGING;
+ if (skb) {
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ flag |= FLAG_SACK_RENEGING;
+ }
if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
@@ -3212,7 +3305,7 @@
}
if (tcp_is_reno(tp)) {
- tcp_remove_reno_sacks(sk, pkts_acked);
+ tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
/* If any of the cumulatively ACKed segments was
* retransmitted, non-SACK case cannot confirm that
@@ -3296,8 +3389,7 @@
unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
when = tcp_clamp_probe0_to_user_timeout(sk, when);
- tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- when, TCP_RTO_MAX, NULL);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
}
}
@@ -3563,7 +3655,7 @@
if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
return;
- if (unlikely(rexmit == 2)) {
+ if (unlikely(rexmit == REXMIT_NEW)) {
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
TCP_NAGLE_OFF);
if (after(tp->snd_nxt, tp->high_seq))
@@ -3582,10 +3674,9 @@
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
- if (flag & FLAG_ECE) {
- tp->delivered_ce += delivered;
+ if (flag & FLAG_ECE)
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
- }
+
return delivered;
}
@@ -3609,6 +3700,7 @@
sack_state.first_sackt = 0;
sack_state.rate = &rs;
+ sack_state.sack_delivered = 0;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);
@@ -3684,6 +3776,10 @@
ack_ev_flags |= CA_ACK_ECE;
}
+ if (sack_state.sack_delivered)
+ tcp_count_delivered(tp, sack_state.sack_delivered,
+ flag & FLAG_ECE);
+
if (flag & FLAG_WIN_UPDATE)
ack_ev_flags |= CA_ACK_WIN_UPDATE;
@@ -3709,7 +3805,8 @@
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
+ flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
+ flag & FLAG_ECE);
tcp_rack_update_reo_wnd(sk, &rs);
@@ -3792,7 +3889,7 @@
foc->exp = exp_opt;
}
-static void smc_parse_options(const struct tcphdr *th,
+static bool smc_parse_options(const struct tcphdr *th,
struct tcp_options_received *opt_rx,
const unsigned char *ptr,
int opsize)
@@ -3801,10 +3898,13 @@
if (static_branch_unlikely(&tcp_have_smc)) {
if (th->syn && !(opsize & 1) &&
opsize >= TCPOLEN_EXP_SMC_BASE &&
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
opt_rx->smc_ok = 1;
+ return true;
+ }
}
#endif
+ return false;
}
/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
@@ -3865,6 +3965,7 @@
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->saw_unknown = 0;
while (length > 0) {
int opcode = *ptr++;
@@ -3955,15 +4056,21 @@
*/
if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
get_unaligned_be16(ptr) ==
- TCPOPT_FASTOPEN_MAGIC)
+ TCPOPT_FASTOPEN_MAGIC) {
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
- else
- smc_parse_options(th, opt_rx, ptr,
- opsize);
+ break;
+ }
+
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
+ break;
+
+ opt_rx->saw_unknown = 1;
break;
+ default:
+ opt_rx->saw_unknown = 1;
}
ptr += opsize-2;
length -= opsize;
@@ -4284,8 +4391,9 @@
* The receiver remembers and reflects via DSACKs. Leverage the
* DSACK state and change the txhash to re-route speculatively.
*/
- if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
- sk_rethink_txhash(sk);
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
+ sk_rethink_txhash(sk))
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -4334,10 +4442,38 @@
sp[i] = sp[i + 1];
continue;
}
- this_sack++, swalk++;
+ this_sack++;
+ swalk++;
}
}
+static void tcp_sack_compress_send_ack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->compressed_ack)
+ return;
+
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+ __sock_put(sk);
+
+ /* Since we have to send one ack finally,
+ * substract one from tp->compressed_ack to keep
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
+ */
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+ tp->compressed_ack - 1);
+
+ tp->compressed_ack = 0;
+ tcp_send_ack(sk);
+}
+
+/* Reasonable amount of sack blocks included in TCP SACK option
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
+ * Given that SACK packets might be lost, be conservative and use 2.
+ */
+#define TCP_SACK_BLOCKS_EXPECTED 2
+
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4350,6 +4486,8 @@
for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
if (tcp_sack_extend(sp, seq, end_seq)) {
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+ tcp_sack_compress_send_ack(sk);
/* Rotate this_sack to the first one. */
for (; this_sack > 0; this_sack--, sp--)
swap(*sp, *(sp - 1));
@@ -4359,6 +4497,9 @@
}
}
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+ tcp_sack_compress_send_ack(sk);
+
/* Could not find an adjacent existing SACK, build a new one,
* put it at the front, and shift everyone else down. We
* always know there is at least one SACK present already here.
@@ -4366,8 +4507,6 @@
* If the sack array is full, forget about the last one.
*/
if (this_sack >= TCP_NUM_SACKS) {
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
- tcp_send_ack(sk);
this_sack--;
tp->rx_opt.num_sacks--;
sp--;
@@ -4419,7 +4558,6 @@
/**
* tcp_try_coalesce - try to merge skb to prior one
* @sk: socket
- * @dest: destination queue
* @to: prior buffer
* @from: buffer to add in queue
* @fragstolen: pointer to boolean
@@ -4443,6 +4581,9 @@
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
+ if (!mptcp_skb_can_collapse(to, from))
+ return false;
+
#ifdef CONFIG_TLS_DEVICE
if (from->decrypted != to->decrypted)
return false;
@@ -4792,6 +4933,9 @@
bool fragstolen;
int eaten;
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb);
+
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
return;
@@ -4962,7 +5106,7 @@
/* The first skb to collapse is:
* - not SYN/FIN and
* - bloated or contains data before "start" or
- * overlaps to the next one.
+ * overlaps to the next one and mptcp allow collapsing.
*/
if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
(tcp_win_from_space(sk, skb->truesize) > skb->len ||
@@ -4971,7 +5115,7 @@
break;
}
- if (n && n != tail &&
+ if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
@@ -5004,6 +5148,7 @@
else
__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
+ mptcp_skb_ext_move(nskb, skb);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
@@ -5023,6 +5168,7 @@
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
+ !mptcp_skb_can_collapse(nskb, skb) ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
#ifdef CONFIG_TLS_DEVICE
@@ -5212,12 +5358,6 @@
return true;
}
-/* When incoming ACK allowed to free some skb from write_queue,
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
- * on the exit from tcp input handler.
- *
- * PROBLEM: sndbuf expansion does not work well with largesend.
- */
static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -5232,16 +5372,13 @@
static void tcp_check_space(struct sock *sk)
{
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
- /* pairs with tcp_poll() */
- smp_mb();
- if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- tcp_new_space(sk);
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
- }
+ /* pairs with tcp_poll() */
+ smp_mb();
+ if (sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
@@ -5288,15 +5425,13 @@
if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
- tp->compressed_ack = 0;
+ tp->dup_ack_counter = 0;
}
-
- if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
+ if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
+ tp->dup_ack_counter++;
goto send_now;
-
+ }
+ tp->compressed_ack++;
if (hrtimer_is_queued(&tp->compressed_ack_timer))
return;
@@ -5309,8 +5444,9 @@
delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
rtt * (NSEC_PER_USEC >> 3)/20);
sock_hold(sk);
- hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED_SOFT);
+ hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
+ sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns,
+ HRTIMER_MODE_REL_PINNED_SOFT);
}
static inline void tcp_ack_snd_check(struct sock *sk)
@@ -5544,6 +5680,8 @@
goto discard;
}
+ bpf_skops_parse_hdr(sk, skb);
+
return true;
discard:
@@ -5754,7 +5892,7 @@
}
EXPORT_SYMBOL(tcp_rcv_established);
-void tcp_init_transfer(struct sock *sk, int bpf_op)
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -5775,8 +5913,10 @@
tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
tp->snd_cwnd_stamp = tcp_jiffies32;
- tcp_call_bpf(sk, bpf_op, 0, NULL);
- tcp_init_congestion_control(sk);
+ bpf_skops_established(sk, bpf_op, skb);
+ /* Initialize congestion control unless BPF initialized it already: */
+ if (!icsk->icsk_ca_initialized)
+ tcp_init_congestion_control(sk);
tcp_init_buffer_space(sk);
}
@@ -5794,7 +5934,7 @@
sk_mark_napi_id(sk, skb);
}
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
@@ -5849,6 +5989,10 @@
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
+ if (tp->total_retrans)
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
+ else
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
skb_rbtree_walk_from(data) {
if (__tcp_retransmit_skb(sk, data, 1))
break;
@@ -5919,8 +6063,14 @@
* the segment and return)"
*/
if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
+ if (icsk->icsk_retransmits == 0)
+ inet_csk_reset_xmit_timer(sk,
+ ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_MIN, TCP_RTO_MAX);
goto reset_and_undo;
+ }
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
@@ -6256,7 +6406,8 @@
} else {
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
+ skb);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
@@ -6364,9 +6515,12 @@
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb);
break;
- /* fall through */
+ }
+ fallthrough;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
@@ -6381,7 +6535,7 @@
return 1;
}
}
- /* Fall through */
+ fallthrough;
case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
@@ -6464,7 +6618,6 @@
struct inet_request_sock *ireq = inet_rsk(req);
req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
- req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
tcp_rsk(req)->snt_synack = 0;
@@ -6543,13 +6696,27 @@
{
if (tcp_sk(sk)->save_syn) {
u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
- u32 *copy;
+ struct saved_syn *saved_syn;
+ u32 mac_hdrlen;
+ void *base;
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
- if (copy) {
- copy[0] = len;
- memcpy(©[1], skb_network_header(skb), len);
- req->saved_syn = copy;
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
+ base = skb_mac_header(skb);
+ mac_hdrlen = skb_mac_header_len(skb);
+ len += mac_hdrlen;
+ } else {
+ base = skb_network_header(skb);
+ mac_hdrlen = 0;
+ }
+
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
+ GFP_ATOMIC);
+ if (saved_syn) {
+ saved_syn->mac_hdrlen = mac_hdrlen;
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
+ memcpy(saved_syn->data, base, len);
+ req->saved_syn = saved_syn;
}
}
}
@@ -6619,8 +6786,12 @@
if (!req)
goto drop;
+ req->syncookie = want_cookie;
tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;
+#if IS_ENABLED(CONFIG_MPTCP)
+ tcp_rsk(req)->is_mptcp = 0;
+#endif
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6678,13 +6849,13 @@
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
- req->cookie_ts = tmp_opt.tstamp_ok;
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
}
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
@@ -6693,7 +6864,7 @@
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
- &foc, TCP_SYNACK_FASTOPEN);
+ &foc, TCP_SYNACK_FASTOPEN, skb);
/* Add the child socket directly into the accept queue */
if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
reqsk_fastopen_remove(fastopen_sk, req, false);
@@ -6711,7 +6882,8 @@
tcp_timeout_init((struct sock *)req));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
- TCP_SYNACK_COOKIE);
+ TCP_SYNACK_COOKIE,
+ skb);
if (want_cookie) {
reqsk_free(req);
return 0;