diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 399 |
1 files changed, 221 insertions, 178 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4811727b8a02..8ec92dec321a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -102,6 +102,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ #define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */ +#define FLAG_TS_PROGRESS 0x40000 /* Positive timestamp delta */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -118,18 +119,18 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #if IS_ENABLED(CONFIG_TLS_DEVICE) static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); -void clean_acked_data_enable(struct inet_connection_sock *icsk, +void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)) { - icsk->icsk_clean_acked = cad; + tp->tcp_clean_acked = cad; static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); -void clean_acked_data_disable(struct inet_connection_sock *icsk) +void clean_acked_data_disable(struct tcp_sock *tp) { static_branch_slow_dec_deferred(&clean_acked_data_enabled); - icsk->icsk_clean_acked = NULL; + tp->tcp_clean_acked = NULL; } EXPORT_SYMBOL_GPL(clean_acked_data_disable); @@ -169,6 +170,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); @@ -185,6 +187,7 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ if (skb) @@ -243,9 +246,15 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; - if (old_ratio != tcp_sk(sk)->scaling_ratio) - WRITE_ONCE(tcp_sk(sk)->window_clamp, - tcp_win_from_space(sk, sk->sk_rcvbuf)); + if (old_ratio != tcp_sk(sk)->scaling_ratio) { + struct tcp_sock *tp = tcp_sk(sk); + + val = tcp_win_from_space(sk, sk->sk_rcvbuf); + tcp_set_window_clamp(sk, val); + + if (tp->window_clamp < tp->rcvq_space.space) + tp->rcvq_space.space = tp->window_clamp; + } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); @@ -325,15 +334,14 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); - const struct dst_entry *dst = __sk_dst_get(sk); - return (dst && dst_metric(dst, RTAX_QUICKACK)) || + return icsk->icsk_ack.dst_quick_ack || (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { - if (tp->ecn_flags & TCP_ECN_OK) + if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -356,10 +364,13 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } -static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + if (tcp_ecn_disabled(tp)) + return; + switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, @@ -388,31 +399,39 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) } } -static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) -{ - if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) - __tcp_ecn_check_ce(sk, skb); -} - static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { - if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { - if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { - if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) + if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) return true; return false; } +static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) +{ + tp->delivered_ce += ecn_count; +} + +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tcp_count_delivered_ce(tp, delivered); +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -630,7 +649,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) inet_csk(sk)->icsk_ack.rcv_mss = hint; } -EXPORT_SYMBOL(tcp_initialize_rcv_mss); +EXPORT_IPV6_MOD(tcp_initialize_rcv_mss); /* Receiver "autotuning" code. * @@ -645,10 +664,12 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { - u32 new_sample = tp->rcv_rtt_est.rtt_us; - long m = sample; + u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us; + long m = sample << 3; - if (new_sample != 0) { + if (old_sample == 0 || m < old_sample) { + new_sample = m; + } else { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially * with chatty applications or bulk transfer apps which @@ -659,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * else with timestamps disabled convergence takes too * long. */ - if (!win_dep) { - m -= (new_sample >> 3); - new_sample += m; - } else { - m <<= 3; - if (m < new_sample) - new_sample = m; - } - } else { - /* No previous measure. */ - new_sample = m << 3; + if (win_dep) + return; + /* Do not use this sample if receive queue is not empty. */ + if (tp->rcv_nxt != tp->copied_seq) + return; + new_sample = old_sample - (old_sample >> 3) + sample; } tp->rcv_rtt_est.rtt_us = new_sample; @@ -693,7 +709,7 @@ new_measure: tp->rcv_rtt_est.time = tp->tcp_mstamp; } -static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) +static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta) { u32 delta, delta_us; @@ -703,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) - delta = 1; + delta = min_delta; delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); return delta_us; } @@ -721,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { - s32 delta = tcp_rtt_tsopt_us(tp); + s32 delta = tcp_rtt_tsopt_us(tp, 0); - if (delta >= 0) + if (delta > 0) tcp_rcv_rtt_update(tp, delta, 0); } } +static void tcp_rcvbuf_grow(struct sock *sk) +{ + const struct net *net = sock_net(sk); + struct tcp_sock *tp = tcp_sk(sk); + int rcvwin, rcvbuf, cap; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || + (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + return; + + /* slow start: allow the sender to double its rate. */ + rcvwin = tp->rcvq_space.space << 1; + + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) + rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; + + cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); + + rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap); + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + /* Make the window clamp follow along. */ + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); + } +} /* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. @@ -735,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 copied; - int time; + int time, inq, copied; trace_tcp_rcv_space_adjust(sk); @@ -747,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk) /* Number of bytes copied to user in last RTT */ copied = tp->copied_seq - tp->rcvq_space.seq; + /* Number of bytes in receive queue. */ + inq = tp->rcv_nxt - tp->copied_seq; + copied -= inq; if (copied <= tp->rcvq_space.space) goto new_measure; - /* A bit of theory : - * copied = bytes received in previous RTT, our base window - * To cope with packet losses, we need a 2x factor - * To cope with slow start, and sender growing its cwin by 100 % - * every RTT, we need a 4x factor, because the ACK we are sending - * now is for the next RTT, not the current one : - * <prev RTT . ><current RTT .. ><next RTT .... > - */ - - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - u64 rcvwin, grow; - int rcvbuf; + trace_tcp_rcvbuf_grow(sk, time); - /* minimal window to cope with packet losses, assuming - * steady state. Add some cushion because of small variations. - */ - rcvwin = ((u64)copied << 1) + 16 * tp->advmss; - - /* Accommodate for sender rate increase (eg. slow start) */ - grow = rcvwin * (copied - tp->rcvq_space.space); - do_div(grow, tp->rcvq_space.space); - rcvwin += (grow << 1); - - rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - if (rcvbuf > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - - /* Make the window clamp follow along. */ - WRITE_ONCE(tp->window_clamp, - tcp_win_from_space(sk, rcvbuf)); - } - } tp->rcvq_space.space = copied; + tcp_rcvbuf_grow(sk); + new_measure: tp->rcvq_space.seq = tp->copied_seq; tp->rcvq_space.time = tp->tcp_mstamp; @@ -851,7 +865,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) icsk->icsk_ack.lrcvtime = now; tcp_save_lrcv_flowlabel(sk, skb); - tcp_ecn_check_ce(sk, skb); + tcp_data_ecn_check(sk, skb); if (skb->len >= 128) tcp_grow_window(sk, skb, true); @@ -1148,15 +1162,6 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } -/* Updates the delivered and delivered_ce counts */ -static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, - bool ece_ack) -{ - tp->delivered += delivered; - if (ece_ack) - tp->delivered_ce += delivered; -} - /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -2252,8 +2257,7 @@ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - delay, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false); *ack_flag &= ~FLAG_SET_XMIT_TIMER; return true; } @@ -2710,6 +2714,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; + trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag); + tp->prr_delivered += newly_acked_sacked; if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + @@ -2892,7 +2898,7 @@ void tcp_simple_retransmit(struct sock *sk) */ tcp_non_congestion_loss_retransmit(sk); } -EXPORT_SYMBOL(tcp_simple_retransmit); +EXPORT_IPV6_MOD(tcp_simple_retransmit); void tcp_enter_recovery(struct sock *sk, bool ece_ack) { @@ -3215,7 +3221,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp); + seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1); rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) @@ -3282,8 +3288,7 @@ void tcp_rearm_rto(struct sock *sk) */ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true); } } @@ -3560,10 +3565,10 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { - unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk)); when = tcp_clamp_probe0_to_user_timeout(sk, when); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true); } } @@ -3808,8 +3813,16 @@ static void tcp_store_ts_recent(struct tcp_sock *tp) tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } -static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta) { + tcp_store_ts_recent(tp); + return tstamp_delta > 0 ? FLAG_TS_PROGRESS : 0; +} + +static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +{ + s32 delta; + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { /* PAWS bug workaround wrt. ACK frames, the PAWS discard * extra check below makes sure this can only happen @@ -3818,9 +3831,13 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. */ - if (tcp_paws_check(&tp->rx_opt, 0)) - tcp_store_ts_recent(tp); + if (tcp_paws_check(&tp->rx_opt, 0)) { + delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent; + return __tcp_replace_ts_recent(tp, delta); + } } + + return 0; } /* This routine deals with acks during a TLP episode and ends an episode by @@ -3856,12 +3873,23 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } -static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +static void tcp_in_ack_event(struct sock *sk, int flag) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->in_ack_event) - icsk->icsk_ca_ops->in_ack_event(sk, flags); + if (icsk->icsk_ca_ops->in_ack_event) { + u32 ack_ev_flags = 0; + + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_SLOWPATH) { + ack_ev_flags |= CA_ACK_SLOWPATH; + if (flag & FLAG_ECE) + ack_ev_flags |= CA_ACK_ECE; + } + + icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags); + } } /* Congestion control has updated the cwnd already. So if we're in @@ -3954,8 +3982,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) - if (icsk->icsk_clean_acked) - icsk->icsk_clean_acked(sk, ack); + if (tp->tcp_clean_acked) + tp->tcp_clean_acked(sk, ack); #endif } @@ -3966,7 +3994,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * is in window. */ if (flag & FLAG_UPDATE_TS_RECENT) - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + flag |= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == FLAG_SND_UNA_ADVANCED) { @@ -3978,12 +4006,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3995,19 +4019,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } if (sack_state.sack_delivered) tcp_count_delivered(tp, sack_state.sack_delivered, flag & FLAG_ECE); - - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; - - tcp_in_ack_event(sk, ack_ev_flags); } /* This is a deviation from RFC3168 since it states that: @@ -4034,6 +4051,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + tcp_in_ack_event(sk, flag); + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -4065,6 +4084,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, @@ -4174,7 +4194,6 @@ u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) } return mss; } -EXPORT_SYMBOL_GPL(tcp_parse_mss_option); /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when @@ -4450,34 +4469,40 @@ static u32 tcp_tsval_replay(const struct sock *sk) return inet_csk(sk)->icsk_rto * 1200 / HZ; } -static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) +static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk, + const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); const struct tcphdr *th = tcp_hdr(skb); - u32 seq = TCP_SKB_CB(skb)->seq; + SKB_DR_INIT(reason, TCP_RFC7323_PAWS); u32 ack = TCP_SKB_CB(skb)->ack_seq; + u32 seq = TCP_SKB_CB(skb)->seq; - return /* 1. Pure ACK with correct sequence number. */ - (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && + /* 1. Is this not a pure ACK ? */ + if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq) + return reason; - /* 2. ... and duplicate ACK. */ - ack == tp->snd_una && + /* 2. Is its sequence not the expected one ? */ + if (seq != tp->rcv_nxt) + return before(seq, tp->rcv_nxt) ? + SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK : + reason; - /* 3. ... and does not update window. */ - !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && + /* 3. Is this not a duplicate ACK ? */ + if (ack != tp->snd_una) + return reason; - /* 4. ... and sits in replay window. */ - (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= - tcp_tsval_replay(sk); -} + /* 4. Is this updating the window ? */ + if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) << + tp->rx_opt.snd_wscale)) + return reason; -static inline bool tcp_paws_discard(const struct sock *sk, - const struct sk_buff *skb) -{ - const struct tcp_sock *tp = tcp_sk(sk); + /* 5. Is this not in the replay window ? */ + if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > + tcp_tsval_replay(sk)) + return reason; - return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) && - !tcp_disordered_ack(sk, skb); + return 0; } /* Check segment sequence number for validity. @@ -4518,7 +4543,7 @@ void tcp_done_with_error(struct sock *sk, int err) if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } -EXPORT_SYMBOL(tcp_done_with_error); +EXPORT_IPV6_MOD(tcp_done_with_error); /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) @@ -4964,7 +4989,7 @@ static void tcp_ofo_queue(struct sock *sk) tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); else kfree_skb_partial(skb, fragstolen); @@ -5007,7 +5032,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) bool fragstolen; tcp_save_lrcv_flowlabel(sk, skb); - tcp_ecn_check_ce(sk, skb); + tcp_data_ecn_check(sk, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); @@ -5143,6 +5168,7 @@ end: skb_condense(skb); skb_set_owner_r(skb, sk); } + tcp_rcvbuf_grow(sk); } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, @@ -5156,7 +5182,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); skb_set_owner_r(skb, sk); } return eaten; @@ -5239,7 +5265,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); return; } - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -5949,23 +5975,35 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) && - tp->rx_opt.saw_tstamp && - tcp_paws_discard(sk, skb)) { - if (!th->rst) { - if (unlikely(th->syn)) - goto syn_challenge; - NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - if (!tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDPAWS, - &tp->last_oow_ack_time)) - tcp_send_dupack(sk, skb); - SKB_DR_SET(reason, TCP_RFC7323_PAWS); - goto discard; - } - /* Reset is accepted even if it did not pass PAWS. */ + if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) || + !tp->rx_opt.saw_tstamp || + tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW)) + goto step1; + + reason = tcp_disordered_ack_check(sk, skb); + if (!reason) + goto step1; + /* Reset is accepted even if it did not pass PAWS. */ + if (th->rst) + goto step1; + if (unlikely(th->syn)) + goto syn_challenge; + + /* Old ACK are common, increment PAWS_OLD_ACK + * and do not send a dupack. + */ + if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK); + goto discard; } + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDPAWS, + &tp->last_oow_ack_time)) + tcp_send_dupack(sk, skb); + goto discard; +step1: /* Step 1: check sequence number */ reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); if (reason) { @@ -6132,6 +6170,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->seq == tp->rcv_nxt && !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { int tcp_header_len = tp->tcp_header_len; + s32 delta = 0; + int flag = 0; /* Timestamp header prediction: tcp_header_len * is automatically equal to th->doff*4 due to pred_flags @@ -6144,8 +6184,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (!tcp_parse_aligned_timestamp(tp, th)) goto slow_path; + delta = tp->rx_opt.rcv_tsval - + tp->rx_opt.ts_recent; /* If PAWS failed, check it more carefully in slow path */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) + if (delta < 0) goto slow_path; /* DO NOT update ts_recent here, if checksum fails @@ -6165,12 +6207,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (tcp_header_len == (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); + flag |= __tcp_replace_ts_recent(tp, + delta); /* We know that such packets are checksummed * on entry. */ - tcp_ack(sk, skb, 0); + tcp_ack(sk, skb, flag); __kfree_skb(skb); tcp_data_snd_check(sk); /* When receiving pure ack in fast path, update @@ -6201,14 +6244,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (tcp_header_len == (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); + flag |= __tcp_replace_ts_recent(tp, + delta); tcp_rcv_rtt_measure_ts(sk, skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); @@ -6216,7 +6260,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ - tcp_ack(sk, skb, FLAG_DATA); + tcp_ack(sk, skb, flag | FLAG_DATA); tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; @@ -6276,7 +6320,7 @@ csum_error: discard: tcp_drop_reason(sk, skb, reason); } -EXPORT_SYMBOL(tcp_rcv_established); +EXPORT_IPV6_MOD(tcp_rcv_established); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { @@ -6329,7 +6373,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; if (sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); @@ -6452,9 +6496,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { /* Previous FIN/ACK or RST/ACK might be ignored. */ if (icsk->icsk_retransmits == 0) - inet_csk_reset_xmit_timer(sk, - ICSK_TIME_RETRANS, - TCP_TIMEOUT_MIN, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + TCP_TIMEOUT_MIN, false); SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE); goto reset_and_undo; } @@ -6569,8 +6612,8 @@ consume: */ inet_csk_schedule_ack(sk); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, false); goto consume; } tcp_send_ack(sk); @@ -6788,10 +6831,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true, &req_stolen)) { - SKB_DR_SET(reason, TCP_FASTOPEN); + SKB_DR_SET(reason, TCP_FASTOPEN); + if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason)) goto discard; - } } if (!th->ack && !th->rst && !th->syn) { @@ -6827,6 +6869,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + if (req) { tcp_rcv_synrecv_state_fastopen(sk); } else { @@ -6852,9 +6897,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); @@ -6904,7 +6946,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -6912,7 +6954,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * if it spins in bh_lock_sock(), but it is really * marginal case. */ - inet_csk_reset_keepalive_timer(sk, tmo); + tcp_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto consume; @@ -6990,7 +7032,7 @@ consume: __kfree_skb(skb); return 0; } -EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_IPV6_MOD(tcp_rcv_state_process); static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) { @@ -7057,6 +7099,7 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; + tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; @@ -7172,7 +7215,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, return mss; } -EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); +EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, @@ -7353,4 +7396,4 @@ drop: tcp_listendrop(sk); return 0; } -EXPORT_SYMBOL(tcp_conn_request); +EXPORT_IPV6_MOD(tcp_conn_request); |