diff options
author | Jakub Kicinski <kuba@kernel.org> | 2025-03-04 17:46:30 -0800 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2025-03-04 17:46:30 -0800 |
commit | 85f66df39bcfc0b35f0b068f281b17c8811342c5 (patch) | |
tree | d798c722eef0e5158040b5825250f3053d183384 | |
parent | 7ff1c88fc89688c27f773ba956f65f0c11367269 (diff) | |
parent | 86c2bc293b8130aec9fa504e953531a84a6eb9a6 (diff) |
Merge branch 'tcp-scale-connect-under-pressure'
Eric Dumazet says:
====================
tcp: scale connect() under pressure
Adoption of bhash2 in linux-6.1 made some operations almost twice
more expensive, because of additional locks.
This series adds RCU in __inet_hash_connect() to help the
case where many attempts need to be made before finding
an available 4-tuple.
This brings a ~200 % improvement in this experiment:
Server:
ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog
Client:
ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server
Before series:
utime_start=0.288582
utime_end=1.548707
stime_start=20.637138
stime_end=2002.489845
num_transactions=484453
latency_min=0.156279245
latency_max=20.922042756
latency_mean=1.546521274
latency_stddev=3.936005194
num_samples=312537
throughput=47426.00
perf top on the client:
49.54% [kernel] [k] _raw_spin_lock
25.87% [kernel] [k] _raw_spin_lock_bh
5.97% [kernel] [k] queued_spin_lock_slowpath
5.67% [kernel] [k] __inet_hash_connect
3.53% [kernel] [k] __inet6_check_established
3.48% [kernel] [k] inet6_ehashfn
0.64% [kernel] [k] rcu_all_qs
After this series:
utime_start=0.271607
utime_end=3.847111
stime_start=18.407684
stime_end=1997.485557
num_transactions=1350742
latency_min=0.014131929
latency_max=17.895073144
latency_mean=0.505675853 # Nice reduction of latency metrics
latency_stddev=2.125164772
num_samples=307884
throughput=139866.80 # 194 % increase
perf top on client:
56.86% [kernel] [k] __inet6_check_established
17.96% [kernel] [k] __inet_hash_connect
13.88% [kernel] [k] inet6_ehashfn
2.52% [kernel] [k] rcu_all_qs
2.01% [kernel] [k] __cond_resched
0.41% [kernel] [k] _raw_spin_lock
====================
Link: https://patch.msgid.link/20250302124237.3913746-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- | include/net/inet_hashtables.h | 7 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 8 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 65 | ||||
-rw-r--r-- | net/ipv4/inet_timewait_sock.c | 2 | ||||
-rw-r--r-- | net/ipv6/inet6_hashtables.c | 23 |
5 files changed, 75 insertions, 30 deletions
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index da818fb0205f..f447d61d9598 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -89,6 +89,7 @@ struct inet_bind_bucket { bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; + struct rcu_head rcu; }; struct inet_bind2_bucket { @@ -226,8 +227,7 @@ struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); -void inet_bind_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind_bucket *tb); +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, @@ -529,7 +529,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, - struct inet_timewait_sock **)); + struct inet_timewait_sock **, + bool rcu_lookup)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index bf9ce0c19657..e93c66034077 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -157,12 +157,10 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - - if (addr_type == IPV6_ADDR_ANY) + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) return false; - if (addr_type != IPV6_ADDR_MAPPED) + if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return true; } #endif @@ -600,7 +598,7 @@ fail_unlock: if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); } if (head2_lock_acquired) spin_unlock(&head2->lock); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9bfcfd016e18..d1b5f45ee718 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -76,7 +76,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->bhash2); - hlist_add_head(&tb->node, &head->chain); + hlist_add_head_rcu(&tb->node, &head->chain); } return tb; } @@ -84,11 +84,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, /* * Caller must hold hashbucket lock for this tb with local BH disabled */ -void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { if (hlist_empty(&tb->bhash2)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); + hlist_del_rcu(&tb->node); + kfree_rcu(tb, rcu); } } @@ -201,7 +201,7 @@ static void __inet_put_port(struct sock *sk) } spin_unlock(&head2->lock); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); } @@ -285,7 +285,7 @@ bhash2_find: error: if (created_inet_bind_bucket) - inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; @@ -537,7 +537,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -551,11 +552,24 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet_match(net, sk2, acookie, ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; + } + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { @@ -994,7 +1008,8 @@ static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, - struct sock *, __u16, struct inet_timewait_sock **)) + struct sock *, __u16, struct inet_timewait_sock **, + bool rcu_lookup)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; @@ -1012,7 +1027,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (port) { local_bh_disable(); - ret = check_established(death_row, sk, port, NULL); + ret = check_established(death_row, sk, port, NULL, false); local_bh_enable(); return ret; } @@ -1048,6 +1063,21 @@ other_parity_scan: continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; + rcu_read_lock(); + hlist_for_each_entry_rcu(tb, &head->chain, node) { + if (!inet_bind_bucket_match(tb, net, port, l3mdev)) + continue; + if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { + rcu_read_unlock(); + goto next_port; + } + if (!check_established(death_row, sk, port, &tw, true)) + break; + rcu_read_unlock(); + goto next_port; + } + rcu_read_unlock(); + spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because @@ -1057,12 +1087,12 @@ other_parity_scan: if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) - goto next_port; + goto next_port_unlock; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, - port, &tw)) + port, &tw, false)) goto ok; - goto next_port; + goto next_port_unlock; } } @@ -1076,8 +1106,9 @@ other_parity_scan: tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; -next_port: +next_port_unlock: spin_unlock_bh(&head->lock); +next_port: cond_resched(); } @@ -1149,7 +1180,7 @@ error: spin_unlock(&head2->lock); if (tb_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); if (tw) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 337390ba85b4..aded4bf1bc16 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -39,7 +39,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, tw->tw_tb = NULL; tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); __sock_put((struct sock *)tw); } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 9ec05e354baa..9be315496459 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -263,7 +263,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -276,11 +277,25 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet6_match(net, sk2, saddr, daddr, + ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; + } + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { |