From 41063e9dd11956f2d285e12e4342e1d232ba0ea2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jun 2012 21:22:05 -0700 Subject: ipv4: Early TCP socket demux. Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could at least in theory be expanded to other kinds of connections. If a TCP socket is established then it's identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later. Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b224eb8bce8b..8416f8a68e65 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); int res; + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (unlikely(dst->obsolete)) { + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + } + if (unlikely(sk->sk_rx_dst == NULL)) + sk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* * Header prediction. * The code loosely follows the one in the famous @@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); - if (skb != NULL) + if (skb != NULL) { + sk->sk_rx_dst = dst_clone(skb_dst(skb)); security_inet_conn_established(sk, skb); + } /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); -- cgit From 5110effee8fde2edfacac9cd12a9960ab2dc39ea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 02:21:03 -0700 Subject: net: Do delayed neigh confirmation. When a dst_confirm() happens, mark the confirmation as pending in the dst. Then on the next packet out, when we have the neigh in-hand, do the update. This removes the dependency in dst_confirm() of dst's having an attached neigh. While we're here, remove the explicit 'dst' NULL check, all except 2 or 3 call sites ensure it's not NULL. So just fix those cases up. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8416f8a68e65..ca0d0e7c9778 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -740,13 +740,13 @@ void tcp_update_metrics(struct sock *sk) if (sysctl_tcp_nometrics_save) return; - dst_confirm(dst); - if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; unsigned long rtt; + dst_confirm(dst); + if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. @@ -3869,9 +3869,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_cong_avoid(sk, ack, prior_in_flight); } - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) - dst_confirm(__sk_dst_get(sk)); - + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { + struct dst_entry *dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + } return 1; no_queue: @@ -6140,9 +6142,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { + struct dst_entry *dst; + tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst_confirm(__sk_dst_get(sk)); + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); if (!sock_flag(sk, SOCK_DEAD)) /* Wake up lingering close() */ -- cgit From 4aabd8ef8c43677cfee3e1e36c5a79edddb41942 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 9 Jul 2012 16:07:30 -0700 Subject: tcp: Move dynamnic metrics handling into seperate file. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 188 +-------------------------------------------------- 1 file changed, 2 insertions(+), 186 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ca0d0e7c9778..055ac49b8b40 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; -int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; @@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static inline void tcp_set_rto(struct sock *sk) +void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk) tcp_bound_rto(sk); } -/* Save metrics learned by this TCP session. - This function is called only, when TCP finishes successfully - i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. - */ -void tcp_update_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (sysctl_tcp_nometrics_save) - return; - - if (dst && (dst->flags & DST_HOST)) { - const struct inet_connection_sock *icsk = inet_csk(sk); - int m; - unsigned long rtt; - - dst_confirm(dst); - - if (icsk->icsk_backoff || !tp->srtt) { - /* This session failed to estimate rtt. Why? - * Probably, no packets returned in time. - * Reset our results. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) - dst_metric_set(dst, RTAX_RTT, 0); - return; - } - - rtt = dst_metric_rtt(dst, RTAX_RTT); - m = rtt - tp->srtt; - - /* If newly calculated rtt larger than stored one, - * store new one. Otherwise, use EWMA. Remember, - * rtt overestimation is always better than underestimation. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) { - if (m <= 0) - set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); - else - set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); - } - - if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { - unsigned long var; - if (m < 0) - m = -m; - - /* Scale deviation to rttvar fixed point */ - m >>= 1; - if (m < tp->mdev) - m = tp->mdev; - - var = dst_metric_rtt(dst, RTAX_RTTVAR); - if (m >= var) - var = m; - else - var -= (var - m) >> 2; - - set_dst_metric_rtt(dst, RTAX_RTTVAR, var); - } - - if (tcp_in_initial_slowstart(tp)) { - /* Slow start still did not finish. */ - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); - if (!dst_metric_locked(dst, RTAX_CWND) && - tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); - } else if (tp->snd_cwnd > tp->snd_ssthresh && - icsk->icsk_ca_state == TCP_CA_Open) { - /* Cong. avoidance phase, cwnd is reliable. */ - if (!dst_metric_locked(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, - max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_cwnd) >> 1); - } else { - /* Else slow start did not finish, cwnd is non-sense, - ssthresh may be also invalid. - */ - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_ssthresh) >> 1); - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); - } - - if (!dst_metric_locked(dst, RTAX_REORDERING)) { - if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && - tp->reordering != sysctl_tcp_reordering) - dst_metric_set(dst, RTAX_REORDERING, tp->reordering); - } - } -} - __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); @@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected */ -static void tcp_disable_fack(struct tcp_sock *tp) +void tcp_disable_fack(struct tcp_sock *tp) { /* RFC3517 uses different metric in lost marker => reset on change */ if (tcp_is_fack(tp)) @@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; } -/* Initialize metrics on socket. */ - -static void tcp_init_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (dst == NULL) - goto reset; - - dst_confirm(dst); - - if (dst_metric_locked(dst, RTAX_CWND)) - tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); - if (dst_metric(dst, RTAX_SSTHRESH)) { - tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } else { - /* ssthresh may have been reduced unnecessarily during. - * 3WHS. Restore it back to its initial default. - */ - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - } - if (dst_metric(dst, RTAX_REORDERING) && - tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); - tp->reordering = dst_metric(dst, RTAX_REORDERING); - } - - if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) - goto reset; - - /* Initial rtt is determined from SYN,SYN-ACK. - * The segment is small and rtt may appear much - * less than real one. Use per-dst memory - * to make it more realistic. - * - * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal circumstances sending small - * packets force peer to delay ACKs and calculation is correct too. - * The algorithm is adaptive and, provided we follow specs, it - * NEVER underestimate RTT. BUT! If peer tries to make some clever - * tricks sort of "quick acks" for time long enough to decrease RTT - * to low value, and then abruptly stops to do it and starts to delay - * ACKs, wait for troubles. - */ - if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { - tp->srtt = dst_metric_rtt(dst, RTAX_RTT); - tp->rtt_seq = tp->snd_nxt; - } - if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { - tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); - } - tcp_set_rto(sk); -reset: - if (tp->srtt == 0) { - /* RFC6298: 5.7 We've failed to get a valid RTT sample from - * 3WHS. This is most likely due to retransmission, - * including spurious one. Reset the RTO back to 3secs - * from the more aggressive 1sec to avoid more spurious - * retransmission. - */ - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; - } - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC6298 more aggressive 1sec - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK - * retransmission has occurred. - */ - if (tp->total_retrans > 1) - tp->snd_cwnd = 1; - else - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static void tcp_update_reordering(struct sock *sk, const int metric, const int ts) { -- cgit From a6df1ae9383697c4eb1365176002f154982325ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Jul 2012 01:41:36 +0000 Subject: tcp: add OFO snmp counters Add three SNMP TCP counters, to better track TCP behavior at global stage (netstat -s), when packets are received Out Of Order (OFO) TCPOFOQueue : Number of packets queued in OFO queue TCPOFODrop : Number of packets meant to be queued in OFO but dropped because socket rcvbuf limit hit. TCPOFOMerge : Number of packets in OFO that were merged with other packets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 055ac49b8b40..cc4e12f1f2f7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4397,8 +4397,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) TCP_ECN_check_ce(tp, skb); - if (tcp_try_rmem_schedule(sk, skb->truesize)) { - /* TODO: should increment a counter */ + if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); __kfree_skb(skb); return; } @@ -4407,6 +4407,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -4460,6 +4461,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); @@ -4498,6 +4500,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb1); } -- cgit From 282f23c6ee343126156dd41218b22ece96d747e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jul 2012 10:13:05 +0200 Subject: tcp: implement RFC 5961 3.2 Implement the RFC 5691 mitigation against Blind Reset attack using RST bit. Idea is to validate incoming RST sequence, to match RCV.NXT value, instead of previouly accepted window : (RCV.NXT <= SEG.SEQ < RCV.NXT+RCV.WND) If sequence is in window but not an exact match, send a "challenge ACK", so that the other part can resend an RST with the appropriate sequence. Add a new sysctl, tcp_challenge_ack_limit, to limit number of challenge ACK sent per second. Add a new SNMP counter to count number of challenge acks sent. (netstat -s | grep TCPChallengeACK) Signed-off-by: Eric Dumazet Cc: Kiran Kumar Kella Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cc4e12f1f2f7..c841a8990377 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -88,6 +88,9 @@ int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +/* rfc5961 challenge ack rate limiting */ +int sysctl_tcp_challenge_ack_limit = 100; + int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -5247,6 +5250,23 @@ out: } #endif /* CONFIG_NET_DMA */ +static void tcp_send_challenge_ack(struct sock *sk) +{ + /* unprotected vars, we dont care of overwrites */ + static u32 challenge_timestamp; + static unsigned int challenge_count; + u32 now = jiffies / HZ; + + if (now != challenge_timestamp) { + challenge_timestamp = now; + challenge_count = 0; + } + if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + tcp_send_ack(sk); + } +} + /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5283,7 +5303,16 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* Step 2: check RST bit */ if (th->rst) { - tcp_reset(sk); + /* RFC 5961 3.2 : + * If sequence number exactly matches RCV.NXT, then + * RESET the connection + * else + * Send a challenge ACK + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + tcp_reset(sk); + else + tcp_send_challenge_ack(sk); goto discard; } -- cgit From 0c24604b68fc7810d429d6c3657b6f148270e528 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jul 2012 01:41:30 +0000 Subject: tcp: implement RFC 5961 4.2 Implement the RFC 5691 mitigation against Blind Reset attack using SYN bit. Section 4.2 of RFC 5961 advises to send a Challenge ACK and drop incoming packet, instead of resetting the session. Add a new SNMP counter to count number of challenge acks sent in response to SYN packets. (netstat -s | grep TCPSYNChallenge) Remove obsolete TCPAbortOnSyn, since we no longer abort a TCP session because of a SYN flag. Signed-off-by: Eric Dumazet Cc: Kiran Kumar Kella Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c841a8990377..8aaec5536111 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5270,8 +5270,8 @@ static void tcp_send_challenge_ack(struct sock *sk) /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ -static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, int syn_inerr) +static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, int syn_inerr) { const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); @@ -5323,20 +5323,22 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* step 3: check security and precedence [ignored] */ - /* step 4: Check for a SYN in window. */ - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* step 4: Check for a SYN + * RFC 5691 4.2 : Send a challenge ack + */ + if (th->syn) { if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return -1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); + tcp_send_challenge_ack(sk); + goto discard; } - return 1; + return true; discard: __kfree_skb(skb); - return 0; + return false; } /* @@ -5366,7 +5368,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); - int res; if (sk->sk_rx_dst) { struct dst_entry *dst = sk->sk_rx_dst; @@ -5555,9 +5556,8 @@ slow_path: * Standard slow path. */ - res = tcp_validate_incoming(sk, skb, th, 1); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 1)) + return 0; step5: if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) @@ -5877,7 +5877,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; - int res; tp->rx_opt.saw_tstamp = 0; @@ -5932,9 +5931,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } - res = tcp_validate_incoming(sk, skb, th, 0); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 0)) + return 0; /* step 5: check the ACK field */ if (th->ack) { -- cgit From e371589917011efe6ff8c7dfb4e9e81934ac5855 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jul 2012 12:29:30 +0000 Subject: tcp: refine SYN handling in tcp_validate_incoming Followup of commit 0c24604b68fc (tcp: implement RFC 5961 4.2) As reported by Vijay Subramanian, we should send a challenge ACK instead of a dup ack if a SYN flag is set on a packet received out of window. This permits the ratelimiting to work as intended, and to increase correct SNMP counters. Suggested-by: Vijay Subramanian Signed-off-by: Eric Dumazet Acked-by: Vijay Subramanian Cc: Kiran Kumar Kella Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8aaec5536111..fdd49f1b7a52 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5296,8 +5296,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * an acknowledgment should be sent in reply (unless the RST * bit is set, if so drop the segment and return)". */ - if (!th->rst) + if (!th->rst) { + if (th->syn) + goto syn_challenge; tcp_send_dupack(sk, skb); + } goto discard; } @@ -5327,6 +5330,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * RFC 5691 4.2 : Send a challenge ack */ if (th->syn) { +syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); -- cgit From 2100c8d2d9db23c0a09901a782bb4e3b21bee298 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:05 +0000 Subject: net-tcp: Fast Open base This patch impelements the common code for both the client and server. 1. TCP Fast Open option processing. Since Fast Open does not have an option number assigned by IANA yet, it shares the experiment option code 254 by implementing draft-ietf-tcpm-experimental-options with a 16 bits magic number 0xF989. This enables global experiments without clashing the scarce(2) experimental options available for TCP. When the draft status becomes standard (maybe), the client should switch to the new option number assigned while the server supports both numbers for transistion. 2. The new sysctl tcp_fastopen 3. A place holder init function Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd49f1b7a52..a06bb8959e7e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3732,7 +3732,8 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, - const u8 **hvpp, int estab) + const u8 **hvpp, int estab, + struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); @@ -3839,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o break; } break; - } + case TCPOPT_EXP: + /* Fast Open option shares code 254 using a + * 16 bits magic number. It's valid only in + * SYN or SYN-ACK with an even size. + */ + if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || + get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || + foc == NULL || !th->syn || (opsize & 1)) + break; + foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; + if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && + foc->len <= TCP_FASTOPEN_COOKIE_MAX) + memcpy(foc->val, ptr + 2, foc->len); + else if (foc->len != 0) + foc->len = -1; + break; + + } ptr += opsize-2; length -= opsize; } @@ -3882,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, if (tcp_parse_aligned_timestamp(tp, th)) return true; } - tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); return true; } @@ -5637,7 +5655,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_cookie_values *cvp = tp->cookie_values; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL); if (th->ack) { /* rfc793: -- cgit From 8e4178c1c7b52f7c99f5fd22ef7af6b2bff409e3 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:08 +0000 Subject: net-tcp: Fast Open client - receiving SYN-ACK On receiving the SYN-ACK after SYN-data, the client needs to a) update the cached MSS and cookie (if included in SYN-ACK) b) retransmit the data not yet acknowledged by the SYN-ACK in the final ACK of the handshake. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a06bb8959e7e..38b6a811edfc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5646,6 +5646,34 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) } } +static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, + struct tcp_fastopen_cookie *cookie) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *data = tcp_write_queue_head(sk); + u16 mss = tp->rx_opt.mss_clamp; + + if (mss == tp->rx_opt.user_mss) { + struct tcp_options_received opt; + const u8 *hash_location; + + /* Get original SYNACK MSS value if user MSS sets mss_clamp */ + tcp_clear_options(&opt); + opt.user_mss = opt.mss_clamp = 0; + tcp_parse_options(synack, &opt, &hash_location, 0, NULL); + mss = opt.mss_clamp; + } + + tcp_fastopen_cache_set(sk, mss, cookie); + + if (data) { /* Retransmit unacked data in SYN */ + tcp_retransmit_skb(sk, data); + tcp_rearm_rto(sk); + return true; + } + return false; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { @@ -5653,9 +5681,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_cookie_values *cvp = tp->cookie_values; + struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); if (th->ack) { /* rfc793: @@ -5665,11 +5694,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send * a reset (unless the RST bit is set, if so drop * the segment and return)" - * - * We do not send data with SYN, so that RFC-correct - * test reduces to: */ - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) goto reset_and_undo; if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -5781,6 +5808,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_finish_connect(sk, skb); + if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc)) + return -1; + if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { -- cgit From aab4874355679c70f93993cf3b3fd74643b9ac33 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:10 +0000 Subject: net-tcp: Fast Open client - detecting SYN-data drops On paths with firewalls dropping SYN with data or experimental TCP options, Fast Open connections will have experience SYN timeout and bad performance. The solution is to track such incidents in the cookie cache and disables Fast Open temporarily. Since only the original SYN includes data and/or Fast Open option, the SYN-ACK has some tell-tale sign (tcp_rcv_fastopen_synack()) to detect such drops. If a path has recurring Fast Open SYN drops, Fast Open is disabled for 2^(recurring_losses) minutes starting from four minutes up to roughly one and half day. sendmsg with MSG_FASTOPEN flag will succeed but it behaves as connect() then write(). Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38b6a811edfc..c49a4fc175bd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5652,6 +5652,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *data = tcp_write_queue_head(sk); u16 mss = tp->rx_opt.mss_clamp; + bool syn_drop; if (mss == tp->rx_opt.user_mss) { struct tcp_options_received opt; @@ -5664,7 +5665,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, mss = opt.mss_clamp; } - tcp_fastopen_cache_set(sk, mss, cookie); + /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably + * the remote receives only the retransmitted (regular) SYNs: either + * the original SYN-data or the corresponding SYN-ACK is lost. + */ + syn_drop = (cookie->len <= 0 && data && + inet_csk(sk)->icsk_retransmits); + + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); if (data) { /* Retransmit unacked data in SYN */ tcp_retransmit_skb(sk, data); -- cgit From 67da22d23fa6f3324e03bcd0580b914b2e4afbf3 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:11 +0000 Subject: net-tcp: Fast Open client - cookie-less mode In trusted networks, e.g., intranet, data-center, the client does not need to use Fast Open cookie to mitigate DoS attacks. In cookie-less mode, sendmsg() with MSG_FASTOPEN flag will send SYN-data regardless of cookie availability. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c49a4fc175bd..e67d685a6c0e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5650,7 +5650,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tcp_write_queue_head(sk); + struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp; bool syn_drop; @@ -5665,6 +5665,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, mss = opt.mss_clamp; } + if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ + cookie->len = -1; + /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably * the remote receives only the retransmitted (regular) SYNs: either * the original SYN-data or the corresponding SYN-ACK is lost. @@ -5816,7 +5819,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_finish_connect(sk, skb); - if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc)) + if ((tp->syn_fastopen || tp->syn_data) && + tcp_rcv_fastopen_synack(sk, skb, &foc)) return -1; if (sk->sk_write_pending || -- cgit From 67b95bd78f0de85793bf30835913f6ef784a39b6 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Thu, 19 Jul 2012 21:32:18 +0000 Subject: tcp: Return bool instead of int where appropriate Applied to a set of static inline functions in tcp_input.c Signed-off-by: Vijay Subramanian Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e67d685a6c0e..21d7f8f3a7a5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2521,7 +2521,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ -static inline int tcp_packet_delayed(const struct tcp_sock *tp) +static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { return !tp->retrans_stamp || (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -2582,7 +2582,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_may_undo(const struct tcp_sock *tp) +static inline bool tcp_may_undo(const struct tcp_sock *tp) { return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } @@ -3371,13 +3371,13 @@ static void tcp_ack_probe(struct sock *sk) } } -static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) +static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) { return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } -static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) +static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { const struct tcp_sock *tp = tcp_sk(sk); return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && @@ -3387,7 +3387,7 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(const struct tcp_sock *tp, +static inline bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { @@ -4006,7 +4006,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(const struct sock *sk, +static inline bool tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4028,7 +4028,7 @@ static inline int tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { return !before(end_seq, tp->rcv_wup) && !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); @@ -5214,7 +5214,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, return result; } -static inline int tcp_checksum_complete_user(struct sock *sk, +static inline bool tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && -- cgit From 92101b3b2e3178087127709a556b091dae314e9e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Jul 2012 16:29:00 -0700 Subject: ipv4: Prepare for change of rt->rt_iif encoding. Use inet_iif() consistently, and for TCP record the input interface of cached RX dst in inet sock. rt->rt_iif is going to be encoded differently, so that we can legitimately cache input routes in the FIB info more aggressively. When the input interface is "use SKB device index" the rt->rt_iif will be set to zero. This forces us to move the TCP RX dst cache installation into the ipv4 specific code, and as well it should since doing the route caching for ipv6 is pointless at the moment since it is not inspected in the ipv6 input paths yet. Also, remove the unlikely on dst->obsolete, all ipv4 dsts have obsolete set to a non-zero value to force invocation of the check callback. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 21d7f8f3a7a5..3e07a64ca44e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5391,18 +5391,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; - if (unlikely(dst->obsolete)) { - if (dst->ops->check(dst, 0) == NULL) { - dst_release(dst); - sk->sk_rx_dst = NULL; - } - } - } - if (unlikely(sk->sk_rx_dst == NULL)) - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - /* * Header prediction. * The code loosely follows the one in the famous -- cgit