summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
author Jakub Kicinski <kuba@kernel.org> 2022-12-19 17:28:51 -0800
committer Jakub Kicinski <kuba@kernel.org> 2022-12-19 17:28:52 -0800
commit 918fb1aaa25812a277ab469679df17f45ce92313 (patch)
tree dedc1398193582d8d52ad9142d95f5b16bd5d50c /include
parent b389a902dd5be4ece505a2e0463b9b034de04bf5 (diff)
parent 08f65892c5ee15806dce7259e06c384b8cd768d7 (diff)
Merge branch 'stop-corrupting-socket-s-task_frag'
Benjamin Coddington says: ==================== Stop corrupting socket's task_frag The networking code uses flags in sk_allocation to determine if it can use current->task_frag, however in-kernel users of sockets may stop setting sk_allocation when they convert to the preferred memalloc_nofs_save/restore, as SUNRPC has done in commit a1231fda7e94 ("SUNRPC: Set memalloc_nofs_save() on all rpciod/xprtiod jobs"). This will cause corruption in current->task_frag when recursing into the network layer for those subsystems during page fault or reclaim. The corruption is difficult to diagnose because stack traces may not contain the offending subsystem at all. The corruption is unlikely to show up in testing because it requires memory pressure, and so subsystems that convert to memalloc_nofs_save/restore are likely to continue to run into this issue. Previous reports and proposed fixes: https://lore.kernel.org/netdev/96a18bd00cbc6cb554603cc0d6ef1c551965b078.1663762494.git.gnault@redhat.com/ https://lore.kernel.org/netdev/b4d8cb09c913d3e34f853736f3f5628abfd7f4b6.1656699567.git.gnault@redhat.com/ https://lore.kernel.org/linux-nfs/de6d99321d1dcaa2ad456b92b3680aa77c07a747.1665401788.git.gnault@redhat.com/ Guillaume Nault has done all of the hard work tracking this problem down and finding the best fix for this issue. I'm just taking a turn posting another fix. ==================== Link: https://lore.kernel.org/r/cover.1671194454.git.bcodding@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include')
-rw-r--r-- include/net/sock.h 10
1 file changed, 6 insertions, 4 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index ecea3dcc2217..dcd72e6285b2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -318,6 +318,9 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
+ * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
+ * Sockets that can be used under memory reclaim should
+ * set this to false.
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
* for timestamping
* @sk_tskey: counter to disambiguate concurrent tstamp requests
@@ -512,6 +515,7 @@ struct sock {
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
+ bool sk_use_task_frag;
struct socket *sk_socket;
void *sk_user_data;
@@ -2560,16 +2564,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
* Both direct reclaim and page faults can nest inside other
* socket operations and end up recursing into sk_page_frag()
* while it's already in use: explicitly avoid task page_frag
- * usage if the caller is potentially doing any of them.
- * This assumes that page fault handlers use the GFP_NOFS flags.
+ * when users disable sk_use_task_frag.
*
* Return: a per task page_frag if context allows that,
* otherwise a per socket one.
*/
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
- if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
- (__GFP_DIRECT_RECLAIM | __GFP_FS))
+ if (sk->sk_use_task_frag)
return &current->task_frag;
return &sk->sk_frag;