author     Jakub Kicinski <kuba@kernel.org>  2024-12-11 20:17:38 -0800
committer  Jakub Kicinski <kuba@kernel.org>  2024-12-11 20:17:38 -0800
commit     154dee7c3265bb8c1e9e87ee63dd195497155854
tree       aca31d37f54235e7af3d4ae4fad6128fbe71f709 /net
parent     148328b59d4b37690b6a06a2e8a0a3f22b7c4aa8
parent     ca6a6f93867a9763bdf8685c788e2e558d10975f
Merge branch 'make-time-wait-reuse-delay-deterministic-and-configurable'
Jakub Sitnicki says:
====================
Make TIME-WAIT reuse delay deterministic and configurable
This patch set is an effort to enable faster reuse of TIME-WAIT sockets.
We have recently talked about the motivation and the idea at Plumbers [1].
Experiment in production
------------------------
We are restarting our experiment on a small set of production nodes, as the
code has changed slightly since v1 [2] and there are still a few weeks of
development window left to soak the changes. We will report back if we
observe any regressions.
Packetdrill tests
-----------------
The packetdrill tests for TIME-WAIT reuse [3] have not changed since v1.
Although we are no longer touching the PAWS code, I would still like to add
tests covering PAWS rejection after TW reuse. That, however, requires
patching packetdrill, as mentioned in the last cover letter [2].
[1] https://lpc.events/event/18/contributions/1962/
[2] https://lore.kernel.org/r/20241113-jakub-krn-909-poc-msec-tw-tstamp-v2-0-b0a335247304@cloudflare.com
[3] https://github.com/google/packetdrill/pull/90
v1: https://lore.kernel.org/20241204-jakub-krn-909-poc-msec-tw-tstamp-v1-0-8b54467a0f34@cloudflare.com
RFCv2: https://lore.kernel.org/20241113-jakub-krn-909-poc-msec-tw-tstamp-v2-0-b0a335247304@cloudflare.com
RFCv1: https://lore.kernel.org/20240819-jakub-krn-909-poc-msec-tw-tstamp-v1-1-6567b5006fbe@cloudflare.com
====================
Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-0-66aca0eed03e@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
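
For context: the series exposes the new knob per network namespace at
/proc/sys/net/ipv4/tcp_tw_reuse_delay, taking a value in milliseconds
(default 1000, clamped to the range 1..TCP_PAWS_MSL * MSEC_PER_SEC by the
sysctl_net_ipv4.c hunk below). What follows is a minimal sketch of how an
administrator might lower the delay once the series is applied; it is
illustrative only, and the 100 ms value is an arbitrary example, not a
recommendation:

/*
 * Illustrative only: write a new TIME-WAIT reuse delay (in
 * milliseconds) to the per-netns sysctl added by this series.
 * Equivalent to: sysctl -w net.ipv4.tcp_tw_reuse_delay=100
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse_delay", "w");

	if (!f) {
		perror("tcp_tw_reuse_delay");
		return 1;
	}
	/* 100 ms is an arbitrary example; the default is 1000 (1 s). */
	fprintf(f, "100\n");
	fclose(f);
	return 0;
}

Note that the delay only matters when TIME-WAIT reuse is enabled at all via
net.ipv4.tcp_tw_reuse; the series leaves that knob's default of 2 (reuse
for loopback connections only) unchanged.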
Diffstat (limited to 'net')
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c  10
-rw-r--r--  net/ipv4/tcp_ipv4.c          7
-rw-r--r--  net/ipv4/tcp_minisocks.c     7
3 files changed, 21 insertions, 3 deletions
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a79b2a52ce01..42cb5dc9cb24 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
 static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
 static int tcp_plb_max_rounds = 31;
 static int tcp_plb_max_cong_thresh = 256;
+static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -1066,6 +1067,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra2		= SYSCTL_TWO,
 	},
 	{
+		.procname	= "tcp_tw_reuse_delay",
+		.data		= &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &tcp_tw_reuse_delay_max,
+	},
+	{
 		.procname	= "tcp_max_syn_backlog",
 		.data		= &init_net.ipv4.sysctl_max_syn_backlog,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a38c8b1f44db..e45222d5fc2e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -120,6 +120,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int ts_recent_stamp;
+	u32 reuse_thresh;
 
 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
 		reuse = 0;
@@ -162,9 +163,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
+		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
 	if (ts_recent_stamp &&
-	    (!twp || (reuse && time_after32(ktime_get_seconds(),
-					    ts_recent_stamp)))) {
+	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
 		 * and releasing the bucket lock.
 		 */
@@ -3457,6 +3459,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 	net->ipv4.sysctl_tcp_tw_reuse = 2;
+	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
 
 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7121d8573928..b089b08e9617 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -157,8 +157,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 					    rcv_nxt);
 
 		if (tmp_opt.saw_tstamp) {
+			u64 ts = tcp_clock_ms();
+
+			WRITE_ONCE(tw->tw_entry_stamp, ts);
 			WRITE_ONCE(tcptw->tw_ts_recent_stamp,
-				   ktime_get_seconds());
+				   div_u64(ts, MSEC_PER_SEC));
 			WRITE_ONCE(tcptw->tw_ts_recent,
 				   tmp_opt.rcv_tsval);
 		}
@@ -316,6 +319,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tw->tw_mark		= sk->sk_mark;
 		tw->tw_priority		= READ_ONCE(sk->sk_priority);
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
+		/* refreshed when we enter true TIME-WAIT state */
+		tw->tw_entry_stamp	= tcp_time_stamp_ms(tp);
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
 		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
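
To make the behavioral change concrete, below is a standalone sketch
(userspace C, not kernel code) of the reuse test tcp_twsk_unique() now
performs: reuse is permitted once the millisecond clock passes the
TIME-WAIT entry stamp plus the configured delay. tw_reusable() and the
sample timestamps are hypothetical names and values made up here;
time_after32() follows the kernel's wraparound-safe 32-bit comparison.

/*
 * Sketch of the new TIME-WAIT reuse check, under the assumptions
 * stated above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True if timestamp a is after b, tolerating u32 wraparound. */
static bool time_after32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

/* Reuse is allowed once now_ms passes entry_stamp_ms + delay_ms. */
static bool tw_reusable(uint32_t now_ms, uint32_t entry_stamp_ms,
			uint32_t delay_ms)
{
	uint32_t reuse_thresh = entry_stamp_ms + delay_ms;

	return time_after32(now_ms, reuse_thresh);
}

int main(void)
{
	uint32_t entered = 41000;	/* entered TIME-WAIT at t = 41 s */
	uint32_t delay = 1000;		/* default: 1 * MSEC_PER_SEC */

	printf("t=41.5s reusable: %d\n", tw_reusable(41500, entered, delay));
	printf("t=42.5s reusable: %d\n", tw_reusable(42500, entered, delay));
	return 0;
}

The old check compared second-granularity stamps (ktime_get_seconds()
against tw_ts_recent_stamp), so the effective delay depended on where
within a second the socket entered TIME-WAIT; keying the threshold off a
millisecond entry stamp plus sysctl_tcp_tw_reuse_delay is what makes the
delay deterministic and configurable, per the series title.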
