diff options
Diffstat (limited to 'net')
| -rw-r--r-- | net/ipv4/Makefile | 2 | ||||
| -rw-r--r-- | net/ipv4/proc.c | 1 | ||||
| -rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 43 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 5 | ||||
| -rw-r--r-- | net/ipv4/tcp_dctcp.c | 23 | ||||
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 8 | ||||
| -rw-r--r-- | net/ipv4/tcp_plb.c | 109 |
7 files changed, 189 insertions, 2 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bbdd9c44f14e..af7d2cf490fb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_rate.o tcp_recovery.o tcp_ulp.o \ - tcp_offload.o datagram.o raw.o udp.o udplite.o \ + tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 5386f460bd20..f88daace9de3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -297,6 +297,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), + SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9b8a6db7a66b..0af28cedd071 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -40,6 +40,8 @@ static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; +static int tcp_plb_max_rounds = 31; +static int tcp_plb_max_cong_thresh = 256; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1384,6 +1386,47 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_plb_enabled", + .data = &init_net.ipv4.sysctl_tcp_plb_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "tcp_plb_idle_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_suspend_rto_sec", + .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "tcp_plb_cong_thresh", + .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef14efa1fb70..de8f0cd7cb32 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3176,6 +3176,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; + tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rack.mstamp = 0; @@ -3939,6 +3940,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_rcv_wnd = tp->rcv_wnd; + info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } @@ -3973,6 +3976,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } @@ -4049,6 +4053,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); + nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 2a6c0dd665a4..e0a2ca7456ff 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -54,6 +54,7 @@ struct dctcp { u32 next_seq; u32 ce_state; u32 loss_cwnd; + struct tcp_plb_state plb; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ @@ -91,6 +92,8 @@ static void dctcp_init(struct sock *sk) ca->ce_state = 0; dctcp_reset(tp, ca); + tcp_plb_init(sk, &ca->plb); + return; } @@ -117,14 +120,28 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { + u32 delivered = tp->delivered - ca->old_delivered; u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; + u32 ce_ratio = 0; + + if (delivered > 0) { + /* dctcp_alpha keeps EWMA of fraction of ECN marked + * packets. Because of EWMA smoothing, PLB reaction can + * be slow so we use ce_ratio which is an instantaneous + * measure of congestion. ce_ratio is the fraction of + * ECN marked packets in the previous RTT. + */ + if (delivered_ce > 0) + ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered; + tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio); + tcp_plb_check_rehash(sk, &ca->plb); + } /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (delivered_ce) { - u32 delivered = tp->delivered - ca->old_delivered; /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 M packets. @@ -172,8 +189,12 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; case CA_EVENT_LOSS: + tcp_plb_update_state_upon_rto(sk, &ca->plb); dctcp_react_to_loss(sk); break; + case CA_EVENT_TX_START: + tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */ + break; default: /* Don't care for the rest. */ break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 87d440f47a70..ebab9e8b184c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3218,6 +3218,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Set default values for PLB */ + net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; + net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; + net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; + /* Default congestion threshold for PLB to mark a round is 50% */ + net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; + /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c new file mode 100644 index 000000000000..bb1a08fda113 --- /dev/null +++ b/net/ipv4/tcp_plb.c @@ -0,0 +1,109 @@ +/* Protective Load Balancing (PLB) + * + * PLB was designed to reduce link load imbalance across datacenter + * switches. PLB is a host-based optimization; it leverages congestion + * signals from the transport layer to randomly change the path of the + * connection experiencing sustained congestion. PLB prefers to repath + * after idle periods to minimize packet reordering. It repaths by + * changing the IPv6 Flow Label on the packets of a connection, which + * datacenter switches include as part of ECMP/WCMP hashing. + * + * PLB is described in detail in: + * + * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, + * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, + * David Wetherall,Abdul Kabbani: + * "PLB: Congestion Signals are Simple and Effective for + * Network Load Balancing" + * In ACM SIGCOMM 2022, Amsterdam Netherlands. + * + */ + +#include <net/tcp.h> + +/* Called once per round-trip to update PLB state for a connection. */ +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio) +{ + struct net *net = sock_net(sk); + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + if (cong_ratio >= 0) { + if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh)) + plb->consec_cong_rounds = 0; + else if (plb->consec_cong_rounds < + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds)) + plb->consec_cong_rounds++; + } +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state); + +/* Check whether recent congestion has been persistent enough to warrant + * a load balancing decision that switches the connection to another path. + */ +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 max_suspend; + bool forced_rehash = false, idle_rehash = false; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + forced_rehash = plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds); + /* If sender goes idle then we check whether to rehash. */ + idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) && + !tcp_sk(sk)->packets_out && + plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds); + + if (!forced_rehash && !idle_rehash) + return; + + /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for + * cases where the max suspension end is before the actual suspension + * end. We clear pause_until to 0 to indicate there is no recent + * RTO event that constrains PLB rehashing. + */ + max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + if (plb->pause_until && + (!before(tcp_jiffies32, plb->pause_until) || + before(tcp_jiffies32 + max_suspend, plb->pause_until))) + plb->pause_until = 0; + + if (plb->pause_until) + return; + + sk_rethink_txhash(sk); + plb->consec_cong_rounds = 0; + tcp_sk(sk)->plb_rehash++; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); +} +EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); + +/* Upon RTO, disallow load balancing for a while, to avoid having load + * balancing decisions switch traffic to a black-holed path that was + * previously avoided with a sk_rethink_txhash() call at RTO time. + */ +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 pause; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + pause += prandom_u32_max(pause); + plb->pause_until = tcp_jiffies32 + pause; + + /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call + * that may switch this connection to a path with completely different + * congestion characteristics. + */ + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto); |
