Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	191
1 file changed, 119 insertions, 72 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 83330a6cb242..184ea556f50e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -261,7 +261,8 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
 		 * cwnd may be very low (even just 1 packet), so we should ACK
 		 * immediately.
 		 */
-		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+		if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
+			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
 	}
 }
 
@@ -517,7 +518,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
  *
  * The algorithm for RTT estimation w/o timestamps is based on
  * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://public.lanl.gov/radiant/pubs.html#DRS>
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
  *
  * More detail on this code can be found at
  * <http://staff.psc.edu/jheffner/>,
@@ -870,12 +871,41 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
+struct tcp_sacktag_state {
+	/* Timestamps for earliest and latest never-retransmitted segment
+	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
+	 * but congestion control should still get an accurate delay signal.
+	 */
+	u64	first_sackt;
+	u64	last_sackt;
+	u32	reord;
+	u32	sack_delivered;
+	int	flag;
+	unsigned int mss_now;
+	struct rate_sample *rate;
+};
+
 /* Take a notice that peer is sending D-SACKs */
-static void tcp_dsack_seen(struct tcp_sock *tp)
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
+			  u32 end_seq, struct tcp_sacktag_state *state)
 {
+	u32 seq_len, dup_segs = 1;
+
+	if (before(start_seq, end_seq)) {
+		seq_len = end_seq - start_seq;
+		if (seq_len > tp->mss_cache)
+			dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
+	}
+
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
 	tp->rack.dsack_seen = 1;
-	tp->dsack_dups++;
+	tp->dsack_dups += dup_segs;
+
+	state->flag |= FLAG_DSACKING_ACK;
+	/* A spurious retransmission is delivered */
+	state->sack_delivered += dup_segs;
+
+	return dup_segs;
 }
 
 /* It's reordering when higher sequence was delivered (i.e. sacked) before
@@ -961,6 +991,15 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
 	}
 }
 
+/* Updates the delivered and delivered_ce counts */
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
+				bool ece_ack)
+{
+	tp->delivered += delivered;
+	if (ece_ack)
+		tp->delivered_ce += delivered;
+}
+
 /* This procedure tags the retransmission queue when SACKs arrive.
  *
  * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
@@ -1093,52 +1132,38 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
 
 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 			    struct tcp_sack_block_wire *sp, int num_sacks,
-			    u32 prior_snd_una)
+			    u32 prior_snd_una, struct tcp_sacktag_state *state)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
 	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
-	bool dup_sack = false;
+	u32 dup_segs;
 
 	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
-		dup_sack = true;
-		tcp_dsack_seen(tp);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
 	} else if (num_sacks > 1) {
 		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
 		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
 
-		if (!after(end_seq_0, end_seq_1) &&
-		    !before(start_seq_0, start_seq_1)) {
-			dup_sack = true;
-			tcp_dsack_seen(tp);
-			NET_INC_STATS(sock_net(sk),
-					LINUX_MIB_TCPDSACKOFORECV);
-		}
+		if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
+			return false;
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
+	} else {
+		return false;
 	}
 
+	dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
+	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
+
 	/* D-SACK for already forgotten data... Do dumb counting. */
-	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
+	if (tp->undo_marker && tp->undo_retrans > 0 &&
 	    !after(end_seq_0, prior_snd_una) &&
 	    after(end_seq_0, tp->undo_marker))
-		tp->undo_retrans--;
+		tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
 
-	return dup_sack;
+	return true;
 }
 
-struct tcp_sacktag_state {
-	u32	reord;
-	/* Timestamps for earliest and latest never-retransmitted segment
-	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
-	 * but congestion control should still get an accurate delay signal.
-	 */
-	u64	first_sackt;
-	u64	last_sackt;
-	struct rate_sample *rate;
-	int	flag;
-	unsigned int mss_now;
-};
-
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
  * the incoming SACK may not exactly match but we can find smaller MSS
  * aligned portion of it that matches. Therefore we might need to fragment
@@ -1257,7 +1282,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		sacked |= TCPCB_SACKED_ACKED;
 		state->flag |= FLAG_DATA_SACKED;
 		tp->sacked_out += pcount;
-		tp->delivered += pcount;  /* Out-of-order packets delivered */
+		/* Out-of-order packets delivered */
+		state->sack_delivered += pcount;
 
 		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
 		if (tp->lost_skb_hint &&
@@ -1680,11 +1706,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 		tcp_highest_sack_reset(sk);
 
 	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
-					 num_sacks, prior_snd_una);
-	if (found_dup_sack) {
-		state->flag |= FLAG_DSACKING_ACK;
-		tp->delivered++; /* A spurious retransmission is delivered */
-	}
+					 num_sacks, prior_snd_una, state);
 
 	/* Eliminate too old ACKs, but take into
 	 * account more or less fresh ones, they can
@@ -1892,7 +1914,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 
 /* Emulate SACKs for SACKless connection: account for a new dupack.
  */
-static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
 {
 	if (num_dupack) {
 		struct tcp_sock *tp = tcp_sk(sk);
@@ -1903,20 +1925,21 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
 		tcp_check_reno_reordering(sk, 0);
 		delivered = tp->sacked_out - prior_sacked;
 		if (delivered > 0)
-			tp->delivered += delivered;
+			tcp_count_delivered(tp, delivered, ece_ack);
 		tcp_verify_left_out(tp);
 	}
 }
 
 /* Account for ACK, ACKing some data in Reno Recovery phase. */
 
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (acked > 0) {
 		/* One ACK acked hole. The rest eat duplicate ACKs. */
-		tp->delivered += max_t(int, acked - tp->sacked_out, 1);
+		tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
+				    ece_ack);
 		if (acked - 1 >= tp->sacked_out)
 			tp->sacked_out = 0;
 		else
@@ -2696,7 +2719,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
 		 * delivered. Lower inflight to clock out (re)tranmissions.
 		 */
 		if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
-			tcp_add_reno_sack(sk, num_dupack);
+			tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
 		else if (flag & FLAG_SND_UNA_ADVANCED)
 			tcp_reset_reno_sack(tp);
 	}
@@ -2778,6 +2801,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fast_rexmit = 0, flag = *ack_flag;
+	bool ece_ack = flag & FLAG_ECE;
 	bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
 				      tcp_force_fast_retransmit(sk));
 
@@ -2786,7 +2810,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 
 	/* Now state machine starts.
	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
-	if (flag & FLAG_ECE)
+	if (ece_ack)
 		tp->prior_ssthresh = 0;
 
 	/* B. In all the states check for reneging SACKs. */
@@ -2827,7 +2851,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 	case TCP_CA_Recovery:
 		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
 			if (tcp_is_reno(tp))
-				tcp_add_reno_sack(sk, num_dupack);
+				tcp_add_reno_sack(sk, num_dupack, ece_ack);
 		} else {
 			if (tcp_try_undo_partial(sk, prior_snd_una))
 				return;
@@ -2852,7 +2876,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		if (tcp_is_reno(tp)) {
 			if (flag & FLAG_SND_UNA_ADVANCED)
 				tcp_reset_reno_sack(tp);
-			tcp_add_reno_sack(sk, num_dupack);
+			tcp_add_reno_sack(sk, num_dupack, ece_ack);
 		}
 
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
@@ -2876,7 +2900,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		}
 
 		/* Otherwise enter Recovery state */
-		tcp_enter_recovery(sk, (flag & FLAG_ECE));
+		tcp_enter_recovery(sk, ece_ack);
 		fast_rexmit = 1;
 	}
 
@@ -2926,6 +2950,8 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
 
 		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+			if (!delta)
+				delta = 1;
 			seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
 			ca_rtt_us = seq_rtt_us;
 		}
@@ -3052,7 +3078,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 			       u32 prior_snd_una,
-			       struct tcp_sacktag_state *sack)
+			       struct tcp_sacktag_state *sack, bool ece_ack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u64 first_ackt, last_ackt;
@@ -3077,8 +3103,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 		u8 sacked = scb->sacked;
 		u32 acked_pcount;
 
-		tcp_ack_tstamp(sk, skb, prior_snd_una);
-
 		/* Determine how many packets and what bytes were acked, tso and else */
 		if (after(scb->end_seq, tp->snd_una)) {
 			if (tcp_skb_pcount(skb) == 1 ||
@@ -3113,7 +3137,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 		if (sacked & TCPCB_SACKED_ACKED) {
 			tp->sacked_out -= acked_pcount;
 		} else if (tcp_is_sack(tp)) {
-			tp->delivered += acked_pcount;
+			tcp_count_delivered(tp, acked_pcount, ece_ack);
 			if (!tcp_skb_spurious_retrans(tp, skb))
 				tcp_rack_advance(tp, sacked, scb->end_seq,
 						 tcp_skb_timestamp_us(skb));
@@ -3142,6 +3166,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 		if (!fully_acked)
 			break;
 
+		tcp_ack_tstamp(sk, skb, prior_snd_una);
+
 		next = skb_rb_next(skb);
 		if (unlikely(skb == tp->retransmit_skb_hint))
 			tp->retransmit_skb_hint = NULL;
@@ -3157,8 +3183,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
 		tp->snd_up = tp->snd_una;
 
-	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
-		flag |= FLAG_SACK_RENEGING;
+	if (skb) {
+		tcp_ack_tstamp(sk, skb, prior_snd_una);
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+			flag |= FLAG_SACK_RENEGING;
+	}
 
 	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
 		seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
@@ -3190,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 		}
 
 		if (tcp_is_reno(tp)) {
-			tcp_remove_reno_sacks(sk, pkts_acked);
+			tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
 
 			/* If any of the cumulatively ACKed segments was
 			 * retransmitted, non-SACK case cannot confirm that
@@ -3487,10 +3516,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 	}
 }
 
-/* This routine deals with acks during a TLP episode.
- * We mark the end of a TLP episode on receiving TLP dupack or when
- * ack is after tlp_high_seq.
- * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+/* This routine deals with acks during a TLP episode and ends an episode by
+ * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
  */
 static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 {
@@ -3499,7 +3526,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	if (before(ack, tp->tlp_high_seq))
 		return;
 
-	if (flag & FLAG_DSACKING_ACK) {
+	if (!tp->tlp_retrans) {
+		/* TLP of new data has been acknowledged */
+		tp->tlp_high_seq = 0;
+	} else if (flag & FLAG_DSACKING_ACK) {
 		/* This DSACK means original and TLP probe arrived; no loss */
 		tp->tlp_high_seq = 0;
 	} else if (after(ack, tp->tlp_high_seq)) {
@@ -3557,10 +3587,9 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
 
 	delivered = tp->delivered - prior_delivered;
 	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
-	if (flag & FLAG_ECE) {
-		tp->delivered_ce += delivered;
+	if (flag & FLAG_ECE)
 		NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
-	}
+
 	return delivered;
 }
 
@@ -3584,6 +3613,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	sack_state.first_sackt = 0;
 	sack_state.rate = &rs;
+	sack_state.sack_delivered = 0;
 
 	/* We very likely will need to access rtx queue. */
 	prefetch(sk->tcp_rtx_queue.rb_node);
@@ -3659,12 +3689,25 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			ack_ev_flags |= CA_ACK_ECE;
 		}
 
+		if (sack_state.sack_delivered)
+			tcp_count_delivered(tp, sack_state.sack_delivered,
+					    flag & FLAG_ECE);
+
 		if (flag & FLAG_WIN_UPDATE)
 			ack_ev_flags |= CA_ACK_WIN_UPDATE;
 
 		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
+	/* This is a deviation from RFC3168 since it states that:
+	 * "When the TCP data sender is ready to set the CWR bit after reducing
+	 * the congestion window, it SHOULD set the CWR bit only on the first
+	 * new data packet that it transmits."
+	 * We accept CWR on pure ACKs to be more robust
+	 * with widely-deployed TCP implementations that do this.
+	 */
+	tcp_ecn_accept_cwr(sk, skb);
+
 	/* We passed data and got it acked, remove any soft error
 	 * log. Something worked...
 	 */
@@ -3675,7 +3718,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		goto no_queue;
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
+	flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
+				    flag & FLAG_ECE);
 
 	tcp_rack_update_reo_wnd(sk, &rs);
 
@@ -4416,7 +4460,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
 /**
  * tcp_try_coalesce - try to merge skb to prior one
  * @sk: socket
- * @dest: destination queue
  * @to: prior buffer
  * @from: buffer to add in queue
  * @fragstolen: pointer to boolean
@@ -4572,6 +4615,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+		sk->sk_data_ready(sk);
 		tcp_drop(sk, skb);
 		return;
 	}
@@ -4605,7 +4649,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
 				 skb, &fragstolen)) {
 coalesce_done:
-		tcp_grow_window(sk, skb);
+		/* For non sack flows, do not grow window to force DUPACK
+		 * and trigger fast retransmit.
+		 */
+		if (tcp_is_sack(tp))
+			tcp_grow_window(sk, skb);
 		kfree_skb_partial(skb, fragstolen);
 		skb = NULL;
 		goto add_sack;
@@ -4689,7 +4737,11 @@ add_sack:
 		tcp_sack_new_ofo_skb(sk, seq, end_seq);
 end:
 	if (skb) {
-		tcp_grow_window(sk, skb);
+		/* For non sack flows, do not grow window to force DUPACK
+		 * and trigger fast retransmit.
+		 */
+		if (tcp_is_sack(tp))
+			tcp_grow_window(sk, skb);
 		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
@@ -4792,8 +4844,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	skb_dst_drop(skb);
 	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
-	tcp_ecn_accept_cwr(sk, skb);
-
 	tp->rx_opt.dsack = 0;
 
 	/*  Queue data for delivery to the user.
@@ -4812,6 +4862,7 @@ queue_and_out:
 			sk_forced_mem_schedule(sk, skb->truesize);
 		else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+			sk->sk_data_ready(sk);
 			goto drop;
 		}
 
@@ -6470,7 +6521,6 @@ static void tcp_openreq_init(struct request_sock *req,
 	struct inet_request_sock *ireq = inet_rsk(req);
 
 	req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
-	req->cookie_ts = 0;
 	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
 	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
 	tcp_rsk(req)->snt_synack = 0;
@@ -6625,6 +6675,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (!req)
 		goto drop;
 
+	req->syncookie = want_cookie;
 	tcp_rsk(req)->af_specific = af_ops;
 	tcp_rsk(req)->ts_off = 0;
#if IS_ENABLED(CONFIG_MPTCP)
@@ -6652,9 +6703,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	af_ops->init_req(req, sk, skb);
 
-	if (IS_ENABLED(CONFIG_MPTCP) && want_cookie)
-		tcp_rsk(req)->is_mptcp = 0;
-
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
@@ -6690,7 +6738,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
-		req->cookie_ts = tmp_opt.tstamp_ok;
 		if (!tmp_opt.tstamp_ok)
 			inet_rsk(req)->ecn_ok = 0;
 	}
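
A note on the D-SACK accounting in the hunks above: tcp_dsack_seen() now derives a duplicate-segment count from the reported D-SACK range instead of always counting one. The userspace sketch below only illustrates that arithmetic and is not kernel code: seq_before(), div_round_up() and dsack_dup_segs() are illustrative stand-ins for the kernel's before(), DIV_ROUND_UP() and the range handling inside the new tcp_dsack_seen(), and the mss_cache parameter plays the role of tp->mss_cache.

/* Userspace sketch of the dup_segs computation added to tcp_dsack_seen().
 * Helper names are illustrative, not the kernel's.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Wrap-safe sequence comparison, mirroring the kernel's before(). */
static int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* Round-up division, mirroring the kernel's DIV_ROUND_UP(). */
static uint32_t div_round_up(uint32_t n, uint32_t d)
{
	return (n + d - 1) / d;
}

/* How many MSS-sized segments does the D-SACK block [start_seq, end_seq) cover? */
static uint32_t dsack_dup_segs(uint32_t start_seq, uint32_t end_seq,
			       uint32_t mss_cache)
{
	uint32_t dup_segs = 1;

	if (seq_before(start_seq, end_seq)) {
		uint32_t seq_len = end_seq - start_seq;

		if (seq_len > mss_cache)
			dup_segs = div_round_up(seq_len, mss_cache);
	}
	return dup_segs;
}

int main(void)
{
	/* A 4000-byte D-SACK block with a 1460-byte MSS counts as three
	 * spuriously retransmitted segments rather than one.
	 */
	printf("dup_segs = %" PRIu32 "\n", dsack_dup_segs(1000, 5000, 1460));
	return 0;
}

In the patch itself this count feeds tp->dsack_dups, the sack_delivered accounting, the undo_retrans bookkeeping and the LINUX_MIB_TCPDSACKRECVSEGS counter, so a single large D-SACK block is credited as several delivered duplicates instead of one.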
