diff options
Diffstat (limited to 'fs')
104 files changed, 1801 insertions, 1639 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 1286d96a29bc..862164181bac 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -59,7 +59,7 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); if (len > 0) __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); - netfs_write_subrequest_terminated(subreq, len ?: err, false); + netfs_write_subrequest_terminated(subreq, len ?: err); } /** @@ -77,7 +77,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ - if (subreq->rreq->origin != NETFS_DIO_READ) + if (subreq->rreq->origin != NETFS_UNBUFFERED_READ && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 45cee6534122..9434a5399f2b 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -189,7 +189,6 @@ struct vfsmount *afs_d_automount(struct path *path) if (IS_ERR(newmnt)) return newmnt; - mntget(newmnt); /* prevent immediate expiration */ mnt_set_expiry(newmnt, &afs_vfsmounts); queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, afs_mntpt_expiry_timeout * HZ); diff --git a/fs/afs/write.c b/fs/afs/write.c index 18b0a9f1615e..2e7526ea883a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -120,17 +120,17 @@ static void afs_issue_write_worker(struct work_struct *work) #if 0 // Error injection if (subreq->debug_index == 3) - return netfs_write_subrequest_terminated(subreq, -ENOANO, false); + return netfs_write_subrequest_terminated(subreq, -ENOANO); if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); + return netfs_write_subrequest_terminated(subreq, -EAGAIN); } #endif op = afs_alloc_operation(wreq->netfs_priv, vnode->volume); if (IS_ERR(op)) - return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); + return netfs_write_subrequest_terminated(subreq, -EAGAIN); afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; @@ -166,7 +166,7 @@ static void afs_issue_write_worker(struct work_struct *work) break; } - netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false); + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len); } void afs_issue_write(struct netfs_io_subrequest *subreq) @@ -202,6 +202,7 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st case NETFS_READ_GAPS: case NETFS_READ_SINGLE: case NETFS_READ_FOR_WRITE: + case NETFS_UNBUFFERED_READ: case NETFS_DIO_READ: return; default: diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 25cf61ebd40c..0a4b1d433621 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -17,6 +17,7 @@ #include <linux/random.h> #include <linux/ratelimit.h> #include <linux/slab.h> +#include <linux/sort.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> @@ -672,8 +673,6 @@ static inline void percpu_memset(void __percpu *p, int c, size_t bytes) u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -#define cmp_int(l, r) ((l > r) - (l < r)) - static inline int u8_cmp(u8 l, u8 r) { return cmp_int(l, r); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 92058ae43488..c08e4a66ac07 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -63,7 +63,7 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret) ret = -ESTALE; } - ki->term_func(ki->term_func_priv, ret, ki->was_async); + ki->term_func(ki->term_func_priv, ret); } cachefiles_put_kiocb(ki); @@ -188,7 +188,7 @@ in_progress: presubmission_error: if (term_func) - term_func(term_func_priv, ret < 0 ? ret : skipped, false); + term_func(term_func_priv, ret < 0 ? ret : skipped); return ret; } @@ -271,7 +271,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing); set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags); if (ki->term_func) - ki->term_func(ki->term_func_priv, ret, ki->was_async); + ki->term_func(ki->term_func_priv, ret); cachefiles_put_kiocb(ki); } @@ -301,7 +301,7 @@ int __cachefiles_write(struct cachefiles_object *object, ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) { if (term_func) - term_func(term_func_priv, -ENOMEM, false); + term_func(term_func_priv, -ENOMEM); return -ENOMEM; } @@ -366,7 +366,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, { if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) { if (term_func) - term_func(term_func_priv, -ENOBUFS, false); + term_func(term_func_priv, -ENOBUFS); trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite); return -ENOBUFS; } @@ -665,7 +665,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) pre = CACHEFILES_DIO_BLOCK_SIZE - off; if (pre >= len) { fscache_count_dio_misfit(); - netfs_write_subrequest_terminated(subreq, len, false); + netfs_write_subrequest_terminated(subreq, len); return; } subreq->transferred += pre; @@ -691,7 +691,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) len -= post; if (len == 0) { fscache_count_dio_misfit(); - netfs_write_subrequest_terminated(subreq, post, false); + netfs_write_subrequest_terminated(subreq, post); return; } iov_iter_truncate(&subreq->io_iter, len); @@ -703,7 +703,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) &start, &len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) { - netfs_write_subrequest_terminated(subreq, ret, false); + netfs_write_subrequest_terminated(subreq, ret); return; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 29be367905a1..b95c4cb21c13 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -238,6 +238,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); if (err < subreq->len && + subreq->rreq->origin != NETFS_UNBUFFERED_READ && subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { @@ -281,7 +282,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) size_t len; int mode; - if (rreq->origin != NETFS_DIO_READ) + if (rreq->origin != NETFS_UNBUFFERED_READ && + rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); @@ -539,7 +541,7 @@ static void ceph_set_page_fscache(struct page *page) folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ } -static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) +static void ceph_fscache_write_terminated(void *priv, ssize_t error) { struct inode *inode = priv; diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig index 272b64456999..1fcd761fe7be 100644 --- a/fs/configfs/Kconfig +++ b/fs/configfs/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config CONFIGFS_FS tristate "Userspace-driven configuration filesystem" - select SYSFS help configfs is a RAM-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based @@ -257,7 +257,7 @@ static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry) wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - xas_pause(xas); + xas_reset(xas); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); @@ -1422,8 +1422,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); mm_inc_nr_ptes(vma->vm_mm); } - pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot); - pmd_entry = pmd_mkhuge(pmd_entry); + pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 9c9129bca346..34517ca9df91 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -102,8 +102,7 @@ static void erofs_fscache_req_io_put(struct erofs_fscache_io *io) erofs_fscache_req_put(req); } -static void erofs_fscache_req_end_io(void *priv, - ssize_t transferred_or_error, bool was_async) +static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error) { struct erofs_fscache_io *io = priv; struct erofs_fscache_rq *req = io->private; @@ -180,8 +179,7 @@ struct erofs_fscache_bio { struct bio_vec bvecs[BIO_MAX_VECS]; }; -static void erofs_fscache_bio_endio(void *priv, - ssize_t transferred_or_error, bool was_async) +static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error) { struct erofs_fscache_bio *io = priv; diff --git a/fs/exec.c b/fs/exec.c index cfbb2b9ee3c9..1f5fdd2e096e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -78,6 +78,9 @@ #include <trace/events/sched.h> +/* For vma exec functions. */ +#include "../mm/internal.h" + static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; @@ -182,60 +185,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } -static int __bprm_mm_init(struct linux_binprm *bprm) -{ - int err; - struct vm_area_struct *vma = NULL; - struct mm_struct *mm = bprm->mm; - - bprm->vma = vma = vm_area_alloc(mm); - if (!vma) - return -ENOMEM; - vma_set_anonymous(vma); - - if (mmap_write_lock_killable(mm)) { - err = -EINTR; - goto err_free; - } - - /* - * Need to be called with mmap write lock - * held, to avoid race with ksmd. - */ - err = ksm_execve(mm); - if (err) - goto err_ksm; - - /* - * Place the stack at the largest stack address the architecture - * supports. Later, we'll move this to an appropriate place. We don't - * use STACK_TOP because that can depend on attributes which aren't - * configured yet. - */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); - vma->vm_end = STACK_TOP_MAX; - vma->vm_start = vma->vm_end - PAGE_SIZE; - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); - vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - - err = insert_vm_struct(mm, vma); - if (err) - goto err; - - mm->stack_vm = mm->total_vm = 1; - mmap_write_unlock(mm); - bprm->p = vma->vm_end - sizeof(void *); - return 0; -err: - ksm_exit(mm); -err_ksm: - mmap_write_unlock(mm); -err_free: - bprm->vma = NULL; - vm_area_free(vma); - return err; -} - static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; @@ -288,12 +237,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, { } -static int __bprm_mm_init(struct linux_binprm *bprm) -{ - bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); - return 0; -} - static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; @@ -322,9 +265,13 @@ static int bprm_mm_init(struct linux_binprm *bprm) bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); - err = __bprm_mm_init(bprm); +#ifndef CONFIG_MMU + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); +#else + err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p); if (err) goto err; +#endif return 0; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6dcbaa218b7a..e80cd8f2c049 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,6 +23,7 @@ #include <linux/swap.h> #include <linux/splice.h> #include <linux/sched.h> +#include <linux/seq_file.h> #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -45,7 +46,7 @@ bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list) return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout); } -bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) +static bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) { int i; @@ -816,7 +817,7 @@ static int unlock_request(struct fuse_req *req) return err; } -void fuse_copy_init(struct fuse_copy_state *cs, int write, +void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); @@ -955,10 +956,10 @@ static int fuse_check_folio(struct folio *folio) * folio that was originally in @pagep will lose a reference and the new * folio returned in @pagep will carry a reference. */ -static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop) { int err; - struct folio *oldfolio = page_folio(*pagep); + struct folio *oldfolio = *foliop; struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; @@ -979,7 +980,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) cs->pipebufs++; cs->nr_segs--; - if (cs->len != PAGE_SIZE) + if (cs->len != folio_size(oldfolio)) goto out_fallback; if (!pipe_buf_try_steal(cs->pipe, buf)) @@ -1025,7 +1026,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = &newfolio->page; + *foliop = newfolio; spin_unlock(&cs->req->waitq.lock); if (err) { @@ -1058,8 +1059,8 @@ out_fallback: goto out_put_old; } -static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, - unsigned offset, unsigned count) +static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio, + unsigned offset, unsigned count) { struct pipe_buffer *buf; int err; @@ -1067,17 +1068,17 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; - get_page(page); + folio_get(folio); err = unlock_request(cs->req); if (err) { - put_page(page); + folio_put(folio); return err; } fuse_copy_finish(cs); buf = cs->pipebufs; - buf->page = page; + buf->page = &folio->page; buf->offset = offset; buf->len = count; @@ -1089,20 +1090,24 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, } /* - * Copy a page in the request to/from the userspace buffer. Must be + * Copy a folio in the request to/from the userspace buffer. Must be * done atomically */ -static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, - unsigned offset, unsigned count, int zeroing) +static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop, + unsigned offset, unsigned count, int zeroing) { int err; - struct page *page = *pagep; + struct folio *folio = *foliop; + size_t size; - if (page && zeroing && count < PAGE_SIZE) - clear_highpage(page); + if (folio) { + size = folio_size(folio); + if (zeroing && count < size) + folio_zero_range(folio, 0, size); + } while (count) { - if (cs->write && cs->pipebufs && page) { + if (cs->write && cs->pipebufs && folio) { /* * Can't control lifetime of pipe buffers, so always * copy user pages. @@ -1112,12 +1117,12 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, if (err) return err; } else { - return fuse_ref_page(cs, page, offset, count); + return fuse_ref_folio(cs, folio, offset, count); } } else if (!cs->len) { - if (cs->move_pages && page && - offset == 0 && count == PAGE_SIZE) { - err = fuse_try_move_page(cs, pagep); + if (cs->move_folios && folio && + offset == 0 && count == size) { + err = fuse_try_move_folio(cs, foliop); if (err <= 0) return err; } else { @@ -1126,22 +1131,30 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, return err; } } - if (page) { - void *mapaddr = kmap_local_page(page); - void *buf = mapaddr + offset; - offset += fuse_copy_do(cs, &buf, &count); + if (folio) { + void *mapaddr = kmap_local_folio(folio, offset); + void *buf = mapaddr; + unsigned int copy = count; + unsigned int bytes_copied; + + if (folio_test_highmem(folio) && count > PAGE_SIZE - offset_in_page(offset)) + copy = PAGE_SIZE - offset_in_page(offset); + + bytes_copied = fuse_copy_do(cs, &buf, ©); kunmap_local(mapaddr); + offset += bytes_copied; + count -= bytes_copied; } else offset += fuse_copy_do(cs, NULL, &count); } - if (page && !cs->write) - flush_dcache_page(page); + if (folio && !cs->write) + flush_dcache_folio(folio); return 0; } -/* Copy pages in the request to/from userspace buffer */ -static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, - int zeroing) +/* Copy folios in the request to/from userspace buffer */ +static int fuse_copy_folios(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) { unsigned i; struct fuse_req *req = cs->req; @@ -1151,23 +1164,12 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); - struct page *orig, *pagep; - orig = pagep = &ap->folios[i]->page; - - err = fuse_copy_page(cs, &pagep, offset, count, zeroing); + err = fuse_copy_folio(cs, &ap->folios[i], offset, count, zeroing); if (err) return err; nbytes -= count; - - /* - * fuse_copy_page may have moved a page from a pipe instead of - * copying into our given page, so update the folios if it was - * replaced. - */ - if (pagep != orig) - ap->folios[i] = page_folio(pagep); } return 0; } @@ -1197,7 +1199,7 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, for (i = 0; !err && i < numargs; i++) { struct fuse_arg *arg = &args[i]; if (i == numargs - 1 && argpages) - err = fuse_copy_pages(cs, arg->size, zeroing); + err = fuse_copy_folios(cs, arg->size, zeroing); else err = fuse_copy_one(cs, arg->value, arg->size); } @@ -1538,7 +1540,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) if (!user_backed_iter(to)) return -EINVAL; - fuse_copy_init(&cs, 1, to); + fuse_copy_init(&cs, true, to); return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } @@ -1561,7 +1563,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, 1, NULL); + fuse_copy_init(&cs, true, NULL); cs.pipebufs = bufs; cs.pipe = pipe; ret = fuse_dev_do_read(fud, in, &cs, len); @@ -1786,20 +1788,23 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { struct folio *folio; - struct page *page; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_grab_folio(mapping, index); err = PTR_ERR(folio); if (IS_ERR(folio)) goto out_iput; - page = &folio->page; - this_num = min_t(unsigned, num, folio_size(folio) - offset); - err = fuse_copy_page(cs, &page, offset, this_num, 0); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + + err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0); if (!folio_test_uptodate(folio) && !err && offset == 0 && - (this_num == folio_size(folio) || file_size == end)) { - folio_zero_segment(folio, this_num, folio_size(folio)); + (nr_bytes == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, nr_bytes, folio_size(folio)); folio_mark_uptodate(folio); } folio_unlock(folio); @@ -1808,9 +1813,9 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, if (err) goto out_iput; - num -= this_num; + num -= nr_bytes; offset = 0; - index++; + index += nr_pages; } err = 0; @@ -1849,7 +1854,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages, cur_pages = 0; + unsigned int num_pages; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1867,6 +1872,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); + num = min(num, num_pages << PAGE_SHIFT); args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); @@ -1887,25 +1893,29 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num && cur_pages < num_pages) { + while (num) { struct folio *folio; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) break; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min(folio_size(folio) - folio_offset, num); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + ap->folios[ap->num_folios] = folio; - ap->descs[ap->num_folios].offset = offset; - ap->descs[ap->num_folios].length = this_num; + ap->descs[ap->num_folios].offset = folio_offset; + ap->descs[ap->num_folios].length = nr_bytes; ap->num_folios++; - cur_pages++; offset = 0; - num -= this_num; - total_len += this_num; - index++; + num -= nr_bytes; + total_len += nr_bytes; + index += nr_pages; } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; @@ -2021,11 +2031,24 @@ static int fuse_notify_resend(struct fuse_conn *fc) return 0; } +/* + * Increments the fuse connection epoch. This will result of dentries from + * previous epochs to be invalidated. + * + * XXX optimization: add call to shrink_dcache_sb()? + */ +static int fuse_notify_inc_epoch(struct fuse_conn *fc) +{ + atomic_inc(&fc->epoch); + + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { - /* Don't try to move pages (yet) */ - cs->move_pages = 0; + /* Don't try to move folios (yet) */ + cs->move_folios = false; switch (code) { case FUSE_NOTIFY_POLL: @@ -2049,6 +2072,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RESEND: return fuse_notify_resend(fc); + case FUSE_NOTIFY_INC_EPOCH: + return fuse_notify_inc_epoch(fc); + default: fuse_copy_finish(cs); return -EINVAL; @@ -2173,7 +2199,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); cs->req = req; if (!req->args->page_replace) - cs->move_pages = 0; + cs->move_folios = false; if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; @@ -2211,7 +2237,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) if (!user_backed_iter(from)) return -EINVAL; - fuse_copy_init(&cs, 0, from); + fuse_copy_init(&cs, false, from); return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } @@ -2285,13 +2311,13 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, 0, NULL); + fuse_copy_init(&cs, false, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; if (flags & SPLICE_F_MOVE) - cs.move_pages = 1; + cs.move_folios = true; ret = fuse_dev_do_write(fud, &cs, len); @@ -2602,6 +2628,17 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } } +#ifdef CONFIG_PROC_FS +static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) +{ + struct fuse_dev *fud = fuse_get_dev(file); + if (!fud) + return; + + seq_printf(seq, "fuse_connection:\t%u\n", fud->fc->dev); +} +#endif + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, @@ -2617,6 +2654,9 @@ const struct file_operations fuse_dev_operations = { #ifdef CONFIG_FUSE_IO_URING .uring_cmd = fuse_uring_cmd, #endif +#ifdef CONFIG_PROC_FS + .show_fdinfo = fuse_dev_show_fdinfo, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index accdce2977c5..249b210becb1 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -140,6 +140,21 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) } } +static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) +{ + struct fuse_ring_ent *ent; + struct fuse_req *req; + + ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); + if (!ent) + return false; + + req = ent->fuse_req; + + return time_is_before_jiffies(req->create_time + + fc->timeout.req_timeout); +} + bool fuse_uring_request_expired(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -157,7 +172,8 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) spin_lock(&queue->lock); if (fuse_request_expired(fc, &queue->fuse_req_queue) || fuse_request_expired(fc, &queue->fuse_req_bg_queue) || - fuse_fpq_processing_expired(fc, queue->fpq.processing)) { + ent_list_request_expired(fc, &queue->ent_w_req_queue) || + ent_list_request_expired(fc, &queue->ent_in_userspace)) { spin_unlock(&queue->lock); return true; } @@ -494,7 +510,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); need_cmd_done = true; ent->cmd = NULL; } @@ -577,8 +593,8 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, if (err) return err; - fuse_copy_init(&cs, 0, &iter); - cs.is_uring = 1; + fuse_copy_init(&cs, false, &iter); + cs.is_uring = true; cs.req = req; return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); @@ -607,8 +623,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, return err; } - fuse_copy_init(&cs, 1, &iter); - cs.is_uring = 1; + fuse_copy_init(&cs, true, &iter); + cs.is_uring = true; cs.req = req; if (num_args > 0) { @@ -714,7 +730,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, cmd = ent->cmd; ent->cmd = NULL; ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); spin_unlock(&queue->lock); io_uring_cmd_done(cmd, 0, 0, issue_flags); @@ -764,7 +780,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, clear_bit(FR_PENDING, &req->flags); ent->fuse_req = req; ent->state = FRRS_FUSE_REQ; - list_move(&ent->list, &queue->ent_w_req_queue); + list_move_tail(&ent->list, &queue->ent_w_req_queue); fuse_uring_add_to_pq(ent, req); } @@ -1180,7 +1196,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, spin_lock(&queue->lock); ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); ent->cmd = NULL; spin_unlock(&queue->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 33b82529cb6e..45b4c3cc1396 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -200,9 +200,14 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, { struct inode *inode; struct fuse_mount *fm; + struct fuse_conn *fc; struct fuse_inode *fi; int ret; + fc = get_fuse_conn_super(dir->i_sb); + if (entry->d_time < atomic_read(&fc->epoch)) + goto invalid; + inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; @@ -319,9 +324,6 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) /* Create the submount */ mnt = fc_mount(fsc); - if (!IS_ERR(mnt)) - mntget(mnt); - put_fs_context(fsc); return mnt; } @@ -415,16 +417,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { - int err; struct fuse_entry_out outarg; + struct fuse_conn *fc; struct inode *inode; struct dentry *newent; + int err, epoch; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + fc = get_fuse_conn_super(dir->i_sb); + epoch = atomic_read(&fc->epoch); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); @@ -446,6 +452,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, goto out_err; entry = newent ? newent : entry; + entry->d_time = epoch; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else @@ -619,7 +626,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, umode_t mode, u32 opcode) { - int err; struct inode *inode; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); @@ -629,11 +635,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + int epoch, err; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); + epoch = atomic_read(&fm->fc->epoch); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) @@ -702,6 +710,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, } kfree(forget); d_instantiate(entry, inode); + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); err = generic_file_open(inode, file); @@ -788,12 +797,14 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; - int err; struct fuse_forget_link *forget; + int epoch, err; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + epoch = atomic_read(&fm->fc->epoch); + forget = fuse_alloc_forget(); if (!forget) return ERR_PTR(-ENOMEM); @@ -835,10 +846,13 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun if (IS_ERR(d)) return d; - if (d) + if (d) { + d->d_time = epoch; fuse_change_entry_timeout(d, &outarg); - else + } else { + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outarg); + } fuse_dir_changed(dir); return d; @@ -1612,10 +1626,10 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct folio *folio) +static int fuse_readlink_folio(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = folio_size(folio) - 1 }; struct fuse_args_pages ap = { .num_folios = 1, .folios = &folio, @@ -1670,7 +1684,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (!folio) goto out_err; - err = fuse_readlink_page(inode, folio); + err = fuse_readlink_folio(inode, folio); if (err) { folio_put(folio); goto out_err; @@ -1946,6 +1960,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; + u64 attr_version; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -2030,6 +2045,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } + + attr_version = fuse_get_attr_version(fm->fc); fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { @@ -2055,6 +2072,14 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, /* FIXME: clear I_DIRTY_SYNC? */ } + if (fi->attr_version > attr_version) { + /* + * Apply attributes, for example for fsnotify_change(), but set + * attribute timeout to zero. + */ + outarg.attr_valid = outarg.attr_valid_nsec = 0; + } + fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode), 0); @@ -2260,7 +2285,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, folio); + int err = fuse_readlink_folio(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 754378dd9f71..f102afc03359 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -415,89 +415,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_folios) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ -static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); -} - -static inline bool fuse_folio_is_writeback(struct inode *inode, - struct folio *folio) -{ - pgoff_t last = folio_next_index(folio) - 1; - return fuse_range_is_writeback(inode, folio_index(folio), last); -} - -static void fuse_wait_on_folio_writeback(struct inode *inode, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); -} - /* * Wait for all pending writepages on the inode to finish. * @@ -532,10 +454,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - inode_lock(inode); - fuse_sync_writes(inode); - inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); if (err) return err; @@ -875,7 +793,7 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = folio_pos(folio); - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + struct fuse_folio_desc desc = { .length = folio_size(folio) }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, @@ -886,13 +804,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) ssize_t res; u64 attr_ver; - /* - * With the temporary pages that are used to complete writeback, we can - * have writeback that extends beyond the lifetime of the folio. So - * make sure we read a properly synced folio. - */ - fuse_wait_on_folio_writeback(inode, folio); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -965,14 +876,13 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = folio_pos(ap->folios[0]); - /* Currently, all folios in FUSE are one page */ - size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -1005,17 +915,13 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); unsigned int max_pages, nr_pages; - pgoff_t first = readahead_index(rac); - pgoff_t last = first + readahead_count(rac) - 1; + struct folio *folio = NULL; if (fuse_is_bad(inode)) return; - wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); - max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); @@ -1033,8 +939,8 @@ static void fuse_readahead(struct readahead_control *rac) while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; - struct folio *folio; unsigned cur_pages = min(max_pages, nr_pages); + unsigned int pages = 0; if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -1046,10 +952,12 @@ static void fuse_readahead(struct readahead_control *rac) ia = fuse_io_alloc(NULL, cur_pages); if (!ia) - return; + break; ap = &ia->ap; - while (ap->num_folios < cur_pages) { + while (pages < cur_pages) { + unsigned int folio_pages; + /* * This returns a folio with a ref held on it. * The ref needs to be held until the request is @@ -1057,13 +965,31 @@ static void fuse_readahead(struct readahead_control *rac) * fuse_try_move_page()) drops the ref after it's * replaced in the page cache. */ - folio = __readahead_folio(rac); + if (!folio) + folio = __readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); + if (folio_pages > cur_pages - pages) { + /* + * Large folios belonging to fuse will never + * have more pages than max_pages. + */ + WARN_ON(!pages); + break; + } + ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; + pages += folio_pages; + folio = NULL; } - fuse_send_readpages(ia, rac->file); - nr_pages -= cur_pages; + fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); + nr_pages -= pages; + } + if (folio) { + folio_end_read(folio, false); + folio_put(folio); } } @@ -1181,7 +1107,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, int err; for (i = 0; i < ap->num_folios; i++) - fuse_wait_on_folio_writeback(inode, ap->folios[i]); + folio_wait_writeback(ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1226,27 +1152,24 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); - unsigned int nr_pages = 0; size_t count = 0; - int err; + unsigned int num; + int err = 0; + + num = min(iov_iter_count(ii), fc->max_write); + num = min(num, max_pages << PAGE_SHIFT); ap->args.in_pages = true; ap->descs[0].offset = offset; - do { + while (num) { size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; - size_t bytes = min_t(size_t, PAGE_SIZE - offset, - iov_iter_count(ii)); - - bytes = min_t(size_t, bytes, fc->max_write - count); + unsigned int bytes; + unsigned int folio_offset; again: - err = -EFAULT; - if (fault_in_iov_iter_readable(ii, bytes)) - break; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { @@ -1257,29 +1180,42 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); - tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + bytes = min(folio_size(folio) - folio_offset, num); + + tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii); flush_dcache_folio(folio); if (!tmp) { folio_unlock(folio); folio_put(folio); + + /* + * Ensure forward progress by faulting in + * while not holding the folio lock: + */ + if (fault_in_iov_iter_readable(ii, bytes)) { + err = -EFAULT; + break; + } + goto again; } - err = 0; ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = folio_offset; ap->descs[ap->num_folios].length = tmp; ap->num_folios++; - nr_pages++; count += tmp; pos += tmp; + num -= tmp; offset += tmp; - if (offset == PAGE_SIZE) + if (offset == folio_size(folio)) offset = 0; - /* If we copied full page, mark it uptodate */ - if (tmp == PAGE_SIZE) + /* If we copied full folio, mark it uptodate */ + if (tmp == folio_size(folio)) folio_mark_uptodate(folio); if (folio_test_uptodate(folio)) { @@ -1288,10 +1224,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, ia->write.folio_locked = true; break; } - if (!fc->big_writes) + if (!fc->big_writes || offset != 0) break; - } while (iov_iter_count(ii) && count < fc->max_write && - nr_pages < max_pages && offset == 0); + } return count > 0 ? count : err; } @@ -1638,7 +1573,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1835,38 +1770,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_folios; i++) - folio_put(ap->folios[i]); - fuse_file_put(wpa->ia.ff, false); kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); -} - static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) - fuse_writepage_finish_stat(inode, ap->folios[i]); + for (i = 0; i < ap->num_folios; i++) { + /* + * Benchmarks showed that ending writeback within the + * scope of the fi->lock alleviates xarray lock + * contention and noticeably improves performance. + */ + folio_end_writeback(ap->folios[i]); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + wb_writeout_inc(&bdi->wb); + } wake_up(&fi->page_waitq); } @@ -1877,13 +1808,15 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_args_pages *ap = &wpa->ia.ap; struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_args *args = &wpa->ia.ap.args; - /* Currently, all folios in FUSE are one page */ - __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; - int err; + struct fuse_args *args = &ap->args; + __u64 data_size = 0; + int err, i; + + for (i = 0; i < ap->num_folios; i++) + data_size += ap->descs[i].length; fi->writectr++; if (inarg->offset + data_size <= size) { @@ -1914,19 +1847,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - next = aux->next; - aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, - aux->ia.ap.folios[0]); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1954,43 +1876,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_folios); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_folios) - p = &(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -2010,41 +1895,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); @@ -2131,19 +1981,16 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t folio_index) + uint32_t folio_index) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; - folio_copy(tmp_folio, folio); - - ap->folios[folio_index] = tmp_folio; + ap->folios[folio_index] = folio; ap->descs[folio_index].offset = 0; - ap->descs[folio_index].length = PAGE_SIZE; + ap->descs[folio_index].length = folio_size(folio); inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2178,18 +2025,12 @@ static int fuse_writepage_locked(struct folio *folio) struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct folio *tmp_folio; struct fuse_file *ff; - int error = -ENOMEM; + int error = -EIO; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto err; - - error = -EIO; ff = fuse_write_file_get(fi); if (!ff) - goto err_nofile; + goto err; wpa = fuse_writepage_args_setup(folio, ff); error = -ENOMEM; @@ -2200,22 +2041,17 @@ static int fuse_writepage_locked(struct folio *folio) ap->num_folios = 1; folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); + fuse_writepage_args_page_fill(wpa, folio, 0); spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - folio_end_writeback(folio); - return 0; err_writepage_args: fuse_file_put(ff, false); -err_nofile: - folio_put(tmp_folio); err: mapping_set_error(folio->mapping, error); return error; @@ -2225,8 +2061,8 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct folio **orig_folios; unsigned int max_folios; + unsigned int nr_pages; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) @@ -2260,69 +2096,11 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_folios = wpa->ia.ap.num_folios; - int i; spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_folios; i++) - folio_end_writeback(data->orig_folios[i]); -} - -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. - */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_folios != 0); - new_ap->num_folios = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == folio->index) { - WARN_ON(tmp->ia.ap.num_folios != 1); - swap(tmp->ia.ap.folios[0], new_ap->folios[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, - folio); - fuse_writepage_free(new_wpa); - } - - return false; } static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, @@ -2331,25 +2109,16 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, { WARN_ON(!ap->num_folios); - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - if (fuse_folio_is_writeback(data->inode, folio)) - return true; - /* Reached max pages */ - if (ap->num_folios == fc->max_pages) + if (data->nr_pages + folio_nr_pages(folio) > fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) + if ((data->nr_pages * PAGE_SIZE) + folio_size(folio) > fc->max_write) return true; /* Discontinuity */ - if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) + if (folio_next_index(ap->folios[ap->num_folios - 1]) != folio->index) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2368,7 +2137,6 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct folio *tmp_folio; int err; if (!data->ff) { @@ -2381,56 +2149,27 @@ static int fuse_writepages_fill(struct folio *folio, if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; + data->nr_pages = 0; } - err = -ENOMEM; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto out_unlock; - - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_folios. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. - */ if (data->wpa == NULL) { err = -ENOMEM; wpa = fuse_writepage_args_setup(folio, data->ff); - if (!wpa) { - folio_put(tmp_folio); + if (!wpa) goto out_unlock; - } fuse_file_get(wpa->ia.ff); data->max_folios = 1; ap = &wpa->ia.ap; } folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); - data->orig_folios[ap->num_folios] = folio; + fuse_writepage_args_page_fill(wpa, folio, ap->num_folios); + data->nr_pages += folio_nr_pages(folio); err = 0; - if (data->wpa) { - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_folios++; - spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, folio)) { + ap->num_folios++; + if (!data->wpa) data->wpa = wpa; - } else { - folio_end_writeback(folio); - } out_unlock: folio_unlock(folio); @@ -2456,13 +2195,7 @@ static int fuse_writepages(struct address_space *mapping, data.inode = inode; data.wpa = NULL; data.ff = NULL; - - err = -ENOMEM; - data.orig_folios = kcalloc(fc->max_pages, - sizeof(struct folio *), - GFP_NOFS); - if (!data.orig_folios) - goto out; + data.nr_pages = 0; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { @@ -2472,7 +2205,6 @@ static int fuse_writepages(struct address_space *mapping, if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_folios); out: return err; } @@ -2497,8 +2229,6 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, folio->index); - if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* @@ -2561,13 +2291,9 @@ static int fuse_launder_folio(struct folio *folio) { int err = 0; if (folio_clear_dirty_for_io(folio)) { - struct inode *inode = folio->mapping->host; - - /* Serialize with pending writeback for the same page */ - fuse_wait_on_page_writeback(inode, folio->index); err = fuse_writepage_locked(folio); if (!err) - fuse_wait_on_page_writeback(inode, folio->index); + folio_wait_writeback(folio); } return err; } @@ -2611,7 +2337,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_NOPAGE; } - fuse_wait_on_folio_writeback(inode, folio); + folio_wait_writeback(folio); return VM_FAULT_LOCKED; } @@ -3429,9 +3155,12 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + if (fc->writeback_cache) + mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); @@ -3439,7 +3168,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); - fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index b3c2e32254ba..5a9bd771a319 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -20,7 +20,6 @@ struct fuse_iqueue; struct fuse_forget_link; struct fuse_copy_state { - int write; struct fuse_req *req; struct iov_iter *iter; struct pipe_buffer *pipebufs; @@ -30,8 +29,9 @@ struct fuse_copy_state { struct page *pg; unsigned int len; unsigned int offset; - unsigned int move_pages:1; - unsigned int is_uring:1; + bool write:1; + bool move_folios:1; + bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ } ring; @@ -51,7 +51,7 @@ struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); void fuse_dev_end_requests(struct list_head *head); -void fuse_copy_init(struct fuse_copy_state *cs, int write, +void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, @@ -64,7 +64,6 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list); -bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing); #endif diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d56d4fd956db..b54f4f57789f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -74,8 +74,8 @@ extern struct list_head fuse_conn_list; extern struct mutex fuse_mutex; /** Module parameters */ -extern unsigned max_user_bgreq; -extern unsigned max_user_congthresh; +extern unsigned int max_user_bgreq; +extern unsigned int max_user_congthresh; /* One forget request */ struct fuse_forget_link { @@ -161,9 +161,6 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; - - /* List of writepage requestst (pending or sent) */ - struct rb_root writepages; }; /* readdir cache (directory only) */ @@ -636,6 +633,9 @@ struct fuse_conn { /** Number of fuse_dev's */ atomic_t dev_count; + /** Current epoch for up-to-date dentries */ + atomic_t epoch; + struct rcu_head rcu; /** The user id for this mount */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd48e8d37f2e..bfe8d8af46f3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -41,7 +41,7 @@ unsigned int fuse_max_pages_limit = 256; unsigned int fuse_default_req_timeout; unsigned int fuse_max_req_timeout; -unsigned max_user_bgreq; +unsigned int max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); __MODULE_PARM_TYPE(max_user_bgreq, "uint"); @@ -49,7 +49,7 @@ MODULE_PARM_DESC(max_user_bgreq, "Global limit for the maximum number of backgrounded requests an " "unprivileged user can set"); -unsigned max_user_congthresh; +unsigned int max_user_congthresh; module_param_call(max_user_congthresh, set_global_limit, param_get_uint, &max_user_congthresh, 0644); __MODULE_PARM_TYPE(max_user_congthresh, "uint"); @@ -962,6 +962,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); + atomic_set(&fc->epoch, 1); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); @@ -1036,7 +1037,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned int mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); @@ -1211,7 +1212,7 @@ static const struct super_operations fuse_super_operations = { .show_options = fuse_show_options, }; -static void sanitize_global_limit(unsigned *limit) +static void sanitize_global_limit(unsigned int *limit) { /* * The default maximum number of async requests is calculated to consume @@ -1232,7 +1233,7 @@ static int set_global_limit(const char *val, const struct kernel_param *kp) if (rv) return rv; - sanitize_global_limit((unsigned *)kp->arg); + sanitize_global_limit((unsigned int *)kp->arg); return 0; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index edcd6f18a8a8..c2aae2eef086 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file, struct fuse_conn *fc; struct inode *inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + int epoch; if (!o->nodeid) { /* @@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file, return -EIO; fc = get_fuse_conn(dir); + epoch = atomic_read(&fc->epoch); name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); @@ -256,6 +258,7 @@ retry: } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + dentry->d_time = epoch; fuse_change_entry_timeout(dentry, o); dput(dentry); @@ -332,35 +335,32 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_io_args ia = {}; - struct fuse_args_pages *ap = &ia.ap; - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + struct fuse_args *args = &ia.ap.args; + void *buf; + size_t bufsize = clamp((unsigned int) ctx->count, PAGE_SIZE, fc->max_pages << PAGE_SHIFT); u64 attr_version = 0, evict_ctr = 0; bool locked; - folio = folio_alloc(GFP_KERNEL, 0); - if (!folio) + buf = kvmalloc(bufsize, GFP_KERNEL); + if (!buf) return -ENOMEM; + args->out_args[0].value = buf; + plus = fuse_use_readdirplus(inode, ctx); - ap->args.out_pages = true; - ap->num_folios = 1; - ap->folios = &folio; - ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS); } else { - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fm, &ap->args); + res = fuse_simple_request(fm, args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { @@ -369,16 +369,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(folio_address(folio), res, - file, ctx, attr_version, + res = parse_dirplusfile(buf, res, file, ctx, attr_version, evict_ctr); } else { - res = parse_dirfile(folio_address(folio), res, file, - ctx); + res = parse_dirfile(buf, res, file, ctx); } } - folio_put(folio); + kvfree(buf); fuse_invalidate_atime(inode); return res; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 653f0ff4b057..85c491fcf1a3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -64,7 +64,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt) void free_sbd(struct gfs2_sbd *sdp) { + struct super_block *sb = sdp->sd_vfs; + free_percpu(sdp->sd_lkstats); + sb->s_fs_info = NULL; kfree(sdp); } @@ -1314,7 +1317,6 @@ fail_iput: iput(sdp->sd_inode); fail_free: free_sbd(sdp); - sb->s_fs_info = NULL; return error; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 748125653d6c..c3c8842920d2 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -764,7 +764,6 @@ fail_reg: fs_err(sdp, "error %d adding sysfs files\n", error); kobject_put(&sdp->sd_kobj); wait_for_completion(&sdp->sd_kobj_unregister); - sb->s_fs_info = NULL; return error; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 233abf598f65..3729391a18f3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1691,6 +1691,8 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, ioend_flags |= IOMAP_IOEND_UNWRITTEN; if (wpc->iomap.flags & IOMAP_F_SHARED) ioend_flags |= IOMAP_IOEND_SHARED; + if (folio_test_dropbehind(folio)) + ioend_flags |= IOMAP_IOEND_DONTCACHE; if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) ioend_flags |= IOMAP_IOEND_BOUNDARY; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index df575a873ec6..9029cd216912 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -15,6 +15,7 @@ #include <linux/mempool.h> #include <linux/seq_file.h> #include <linux/writeback.h> +#include <linux/migrate.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" @@ -151,7 +152,59 @@ static inline void dec_io(struct folio *folio, blk_status_t status, handler(folio, anchor->status); } +#ifdef CONFIG_MIGRATION +static int __metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + struct meta_anchor *src_anchor = src->private; + struct metapage *mps[MPS_PER_PAGE] = {0}; + struct metapage *mp; + int i, rc; + + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = src_anchor->mp[i]; + if (mp && metapage_locked(mp)) + return -EAGAIN; + } + + rc = filemap_migrate_folio(mapping, dst, src, mode); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = src_anchor->mp[i]; + if (!mp) + continue; + if (unlikely(insert_metapage(dst, mp))) { + /* If error, roll-back previosly inserted pages */ + for (int j = 0 ; j < i; j++) { + if (mps[j]) + remove_metapage(dst, mps[j]); + } + return -EAGAIN; + } + mps[i] = mp; + } + + /* Update the metapage and remove it from src */ + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = mps[i]; + if (mp) { + int page_offset = mp->data - folio_address(src); + + mp->data = folio_address(dst) + page_offset; + mp->folio = dst; + remove_metapage(src, mp); + } + } + + return MIGRATEPAGE_SUCCESS; +} +#endif /* CONFIG_MIGRATION */ + #else + static inline struct metapage *folio_to_mp(struct folio *folio, int offset) { return folio->private; @@ -175,6 +228,35 @@ static inline void remove_metapage(struct folio *folio, struct metapage *mp) #define inc_io(folio) do {} while(0) #define dec_io(folio, status, handler) handler(folio, status) +#ifdef CONFIG_MIGRATION +static int __metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + struct metapage *mp; + int page_offset; + int rc; + + mp = folio_to_mp(src, 0); + if (metapage_locked(mp)) + return -EAGAIN; + + rc = filemap_migrate_folio(mapping, dst, src, mode); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (unlikely(insert_metapage(dst, mp))) + return -EAGAIN; + + page_offset = mp->data - folio_address(src); + mp->data = folio_address(dst) + page_offset; + mp->folio = dst; + remove_metapage(src, mp); + + return MIGRATEPAGE_SUCCESS; +} +#endif /* CONFIG_MIGRATION */ + #endif static inline struct metapage *alloc_metapage(gfp_t gfp_mask) @@ -554,6 +636,29 @@ static bool metapage_release_folio(struct folio *folio, gfp_t gfp_mask) return ret; } +#ifdef CONFIG_MIGRATION +/* + * metapage_migrate_folio - Migration function for JFS metapages + */ +static int metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + int expected_count; + + if (!src->private) + return filemap_migrate_folio(mapping, dst, src, mode); + + /* Check whether page does not have extra refs before we do more work */ + expected_count = folio_expected_ref_count(src) + 1; + if (folio_ref_count(src) != expected_count) + return -EAGAIN; + return __metapage_migrate_folio(mapping, dst, src, mode); +} +#else +#define metapage_migrate_folio NULL +#endif /* CONFIG_MIGRATION */ + static void metapage_invalidate_folio(struct folio *folio, size_t offset, size_t length) { @@ -570,6 +675,7 @@ const struct address_space_operations jfs_metapage_aops = { .release_folio = metapage_release_folio, .invalidate_folio = metapage_invalidate_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = metapage_migrate_folio, }; struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, diff --git a/fs/mount.h b/fs/mount.h index 7aecf2a60472..ad7173037924 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -7,10 +7,6 @@ extern struct list_head notify_list; -typedef __u32 __bitwise mntns_flags_t; - -#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0)) - struct mnt_namespace { struct ns_common ns; struct mount * root; @@ -37,7 +33,6 @@ struct mnt_namespace { struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ - mntns_flags_t mntns_flags; } __randomize_layout; struct mnt_pcp { diff --git a/fs/namespace.c b/fs/namespace.c index 552ad7f4d18b..2f2e93927f46 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1326,21 +1326,6 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type, } EXPORT_SYMBOL_GPL(vfs_kern_mount); -struct vfsmount * -vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, - const char *name, void *data) -{ - /* Until it is worked out how to pass the user namespace - * through from the parent mount to the submount don't support - * unprivileged mounts with submounts. - */ - if (mountpoint->d_sb->s_user_ns != &init_user_ns) - return ERR_PTR(-EPERM); - - return vfs_kern_mount(type, SB_SUBMOUNT, name, data); -} -EXPORT_SYMBOL_GPL(vfs_submount); - static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -3649,7 +3634,7 @@ static int do_move_mount(struct path *old_path, if (!(attached ? check_mnt(old) : is_anon_ns(ns))) goto out; - if (is_anon_ns(ns)) { + if (is_anon_ns(ns) && ns == p->mnt_ns) { /* * Ending up with two files referring to the root of the * same anonymous mount namespace would cause an error @@ -3657,16 +3642,7 @@ static int do_move_mount(struct path *old_path, * twice into the mount tree which would be rejected * later. But be explicit about it right here. */ - if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns)) - goto out; - - /* - * If this is an anonymous mount tree ensure that mount - * propagation can detect mounts that were just - * propagated to the target mount tree so we don't - * propagate onto them. - */ - ns->mntns_flags |= MNTNS_PROPAGATING; + goto out; } else if (is_anon_ns(p->mnt_ns)) { /* * Don't allow moving an attached mount tree to an @@ -3723,8 +3699,6 @@ static int do_move_mount(struct path *old_path, if (attached) put_mountpoint(old_mp); out: - if (is_anon_ns(ns)) - ns->mntns_flags &= ~MNTNS_PROPAGATING; unlock_mount(mp); if (!err) { if (attached) { @@ -3900,10 +3874,6 @@ int finish_automount(struct vfsmount *m, const struct path *path) return PTR_ERR(m); mnt = real_mount(m); - /* The new mount record should have at least 2 refs to prevent it being - * expired before we get a chance to add it - */ - BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && m->mnt_root == dentry) { @@ -3936,7 +3906,6 @@ int finish_automount(struct vfsmount *m, const struct path *path) unlock_mount(mp); if (unlikely(err)) goto discard; - mntput(m); return 0; discard_locked: @@ -3950,7 +3919,6 @@ discard: namespace_unlock(); } mntput(m); - mntput(m); return err; } @@ -3987,11 +3955,14 @@ void mark_mounts_for_expiry(struct list_head *mounts) /* extract from the expiration list every vfsmount that matches the * following criteria: + * - already mounted * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { + if (!is_mounted(&mnt->mnt)) + continue; if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 0d1b6d35ff3b..18b3dc74c70e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -78,7 +78,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in * [!] NOTE: This must be run in the same thread as ->issue_read() was called * in as we access the readahead_control struct. */ -static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) +static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq, + struct readahead_control *ractl) { struct netfs_io_request *rreq = subreq->rreq; size_t rsize = subreq->len; @@ -86,7 +87,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER) rsize = umin(rsize, rreq->io_streams[0].sreq_max_len); - if (rreq->ractl) { + if (ractl) { /* If we don't have sufficient folios in the rolling buffer, * extract a folioq's worth from the readahead region at a time * into the buffer. Note that this acquires a ref on each page @@ -99,7 +100,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) while (rreq->submitted < subreq->start + rsize) { ssize_t added; - added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl, + added = rolling_buffer_load_from_ra(&rreq->buffer, ractl, &put_batch); if (added < 0) return added; @@ -211,7 +212,8 @@ static void netfs_issue_read(struct netfs_io_request *rreq, * slicing up the region to be read according to available cache blocks and * network rsize. */ -static void netfs_read_to_pagecache(struct netfs_io_request *rreq) +static void netfs_read_to_pagecache(struct netfs_io_request *rreq, + struct readahead_control *ractl) { struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned long long start = rreq->start; @@ -262,9 +264,9 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (ret < 0) { subreq->error = ret; /* Not queued - release both refs. */ - netfs_put_subrequest(subreq, false, + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); - netfs_put_subrequest(subreq, false, + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } @@ -291,14 +293,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) break; issue: - slice = netfs_prepare_read_iterator(subreq); + slice = netfs_prepare_read_iterator(subreq, ractl); if (slice < 0) { ret = slice; subreq->error = ret; trace_netfs_sreq(subreq, netfs_sreq_trace_cancel); /* Not queued - release both refs. */ - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } size -= slice; @@ -312,7 +314,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (unlikely(size > 0)) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } /* Defer error return as we may need to wait for outstanding I/O. */ @@ -359,18 +361,15 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); - rreq->ractl = ractl; rreq->submitted = rreq->start; if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0) goto cleanup_free; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, ractl); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); - return; + return netfs_put_request(rreq, netfs_rreq_trace_put_return); cleanup_free: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); - return; + return netfs_put_request(rreq, netfs_rreq_trace_put_failed); } EXPORT_SYMBOL(netfs_readahead); @@ -389,7 +388,6 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo if (added < 0) return added; rreq->submitted = rreq->start + added; - rreq->ractl = (struct readahead_control *)1UL; return 0; } @@ -459,7 +457,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len); rreq->submitted = rreq->start + flen; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); if (sink) folio_put(sink); @@ -470,11 +468,11 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) folio_mark_uptodate(folio); } folio_unlock(folio); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); alloc_error: folio_unlock(folio); return ret; @@ -528,13 +526,13 @@ int netfs_read_folio(struct file *file, struct folio *folio) if (ret < 0) goto discard; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); alloc_error: folio_unlock(folio); return ret; @@ -685,11 +683,11 @@ retry: if (ret < 0) goto error_put; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); if (ret < 0) goto error; - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); have_folio: ret = folio_wait_private_2_killable(folio); @@ -701,7 +699,7 @@ have_folio_no_wait: return 0; error_put: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(rreq, netfs_rreq_trace_put_failed); error: if (folio) { folio_unlock(folio); @@ -750,13 +748,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, if (ret < 0) goto error_put; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; error_put: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); error: _leave(" = %d", ret); return ret; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index b4826360a411..72a3e6db2524 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -115,8 +115,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, size_t max_chunk = mapping_max_folio_size(mapping); bool maybe_trouble = false; - if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || - iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) + if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) ) { wbc_attach_fdatawrite_inode(&wbc, mapping->host); @@ -386,7 +385,7 @@ out: wbc_detach_inode(&wbc); if (ret2 == -EIOCBQUEUED) return ret2; - if (ret == 0) + if (ret == 0 && ret2 < 0) ret = ret2; } diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 5e3f0aeb51f3..a05e13472baf 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -85,7 +85,7 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } } @@ -103,19 +103,16 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) rreq->netfs_ops->issue_read(subreq); if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) - netfs_wait_for_pause(rreq); + netfs_wait_for_paused_read(rreq); if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) break; - if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && - test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) - break; cond_resched(); } while (size > 0); if (unlikely(size > 0)) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } return ret; @@ -144,7 +141,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) ret = netfs_dispatch_unbuffered_reads(rreq); if (!rreq->submitted) { - netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + netfs_put_request(rreq, netfs_rreq_trace_put_no_submit); inode_dio_end(rreq->inode); ret = 0; goto out; @@ -188,7 +185,8 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, iocb->ki_pos, orig_count, - NETFS_DIO_READ); + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_READ : NETFS_UNBUFFERED_READ); if (IS_ERR(rreq)) return PTR_ERR(rreq); @@ -236,7 +234,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i } out: - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); if (ret > 0) orig_count -= ret; return ret; diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 42ce53cc216e..fa9a5bf3c6d5 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -87,6 +87,8 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * } __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); + if (async) + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); /* Copy the data into the bounce buffer and encrypt it. */ // TODO @@ -105,19 +107,15 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * if (!async) { trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - ret = wreq->error; - if (ret == 0) { - ret = wreq->transferred; + ret = netfs_wait_for_write(wreq); + if (ret > 0) iocb->ki_pos += ret; - } } else { ret = -EIOCBQUEUED; } out: - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index b1722a82c03d..e4308457633c 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -192,8 +192,7 @@ EXPORT_SYMBOL(__fscache_clear_page_bits); /* * Deal with the completion of writing the data to the cache. */ -static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, - bool was_async) +static void fscache_wreq_done(void *priv, ssize_t transferred_or_error) { struct fscache_write_request *wreq = priv; @@ -202,8 +201,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, wreq->set_bits); if (wreq->term_func) - wreq->term_func(wreq->term_func_priv, transferred_or_error, - was_async); + wreq->term_func(wreq->term_func_priv, transferred_or_error); fscache_end_operation(&wreq->cache_resources); kfree(wreq); } @@ -255,14 +253,14 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, return; abandon_end: - return fscache_wreq_done(wreq, ret, false); + return fscache_wreq_done(wreq, ret); abandon_free: kfree(wreq); abandon: if (using_pgpriv2) fscache_clear_page_bits(mapping, start, len, cond); if (term_func) - term_func(term_func_priv, ret, false); + term_func(term_func_priv, ret); } EXPORT_SYMBOL(__fscache_write_to_cache); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 1c4f953c3d68..e2ee9183392b 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -23,7 +23,7 @@ /* * buffered_read.c */ -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); @@ -62,6 +62,14 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq, enum netfs_folioq_trace trace); void netfs_reset_iter(struct netfs_io_subrequest *subreq); +void netfs_wake_collector(struct netfs_io_request *rreq); +void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq); +void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, + struct netfs_io_stream *stream); +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq); +ssize_t netfs_wait_for_write(struct netfs_io_request *rreq); +void netfs_wait_for_paused_read(struct netfs_io_request *rreq); +void netfs_wait_for_paused_write(struct netfs_io_request *rreq); /* * objects.c @@ -71,9 +79,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, loff_t start, size_t len, enum netfs_io_origin origin); void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); -void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async); -void netfs_put_request(struct netfs_io_request *rreq, bool was_async, - enum netfs_rreq_ref_trace what); +void netfs_clear_subrequests(struct netfs_io_request *rreq); +void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); static inline void netfs_see_request(struct netfs_io_request *rreq, @@ -92,11 +99,9 @@ static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq, /* * read_collect.c */ +bool netfs_read_collection(struct netfs_io_request *rreq); void netfs_read_collection_worker(struct work_struct *work); -void netfs_wake_read_collector(struct netfs_io_request *rreq); -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); -ssize_t netfs_wait_for_read(struct netfs_io_request *rreq); -void netfs_wait_for_pause(struct netfs_io_request *rreq); +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error); /* * read_pgpriv2.c @@ -176,8 +181,8 @@ static inline void netfs_stat_d(atomic_t *stat) * write_collect.c */ int netfs_folio_written_back(struct folio *folio); +bool netfs_write_collection(struct netfs_io_request *wreq); void netfs_write_collection_worker(struct work_struct *work); -void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async); /* * write_issue.c @@ -198,8 +203,8 @@ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *folio, size_t copied, bool to_page_end, struct folio **writethrough_cache); -int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *writethrough_cache); +ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache); int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); /* @@ -255,6 +260,21 @@ static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr) } /* + * Clear and wake up a NETFS_RREQ_* flag bit on a request. + */ +static inline void netfs_wake_rreq_flag(struct netfs_io_request *rreq, + unsigned int rreq_flag, + enum netfs_rreq_trace trace) +{ + if (test_bit(rreq_flag, &rreq->flags)) { + trace_netfs_rreq(rreq, trace); + clear_bit_unlock(rreq_flag, &rreq->flags); + smp_mb__after_atomic(); /* Set flag before task state */ + wake_up(&rreq->waitq); + } +} + +/* * fscache-cache.c */ #ifdef CONFIG_PROC_FS diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 70ecc8f5f210..3db401d269e7 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -39,6 +39,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READ_GAPS] = "RG", [NETFS_READ_SINGLE] = "R1", [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_UNBUFFERED_READ] = "UR", [NETFS_DIO_READ] = "DR", [NETFS_WRITEBACK] = "WB", [NETFS_WRITEBACK_SINGLE] = "W1", diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 7099aa07737a..43b67a28a8fa 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -313,3 +313,222 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) return true; } EXPORT_SYMBOL(netfs_release_folio); + +/* + * Wake the collection work item. + */ +void netfs_wake_collector(struct netfs_io_request *rreq) +{ + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { + queue_work(system_unbound_wq, &rreq->work); + } else { + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); + wake_up(&rreq->waitq); + } +} + +/* + * Mark a subrequest as no longer being in progress and, if need be, wake the + * collector. + */ +void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct netfs_io_stream *stream = &rreq->io_streams[subreq->stream_nr]; + + clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ + + /* If we are at the head of the queue, wake up the collector. */ + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) + netfs_wake_collector(rreq); +} + +/* + * Wait for all outstanding I/O in a stream to quiesce. + */ +void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, + struct netfs_io_stream *stream) +{ + struct netfs_io_subrequest *subreq; + DEFINE_WAIT(myself); + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + continue; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + break; + + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + } + + finish_wait(&rreq->waitq, &myself); +} + +/* + * Perform collection in app thread if not offloaded to workqueue. + */ +static int netfs_collect_in_app(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + bool need_collect = false, inactive = true; + + for (int i = 0; i < NR_IO_STREAMS; i++) { + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[i]; + + if (!stream->active) + continue; + inactive = false; + trace_netfs_collect_stream(rreq, stream); + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, + rreq_link); + if (subreq && + (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + need_collect = true; + break; + } + } + + if (!need_collect && !inactive) + return 0; /* Sleep */ + + __set_current_state(TASK_RUNNING); + if (collector(rreq)) { + /* Drop the ref from the NETFS_RREQ_IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + return 1; /* Done */ + } + + if (inactive) { + WARN(true, "Failed to collect inactive req R=%08x\n", + rreq->debug_id); + cond_resched(); + } + return 2; /* Again */ +} + +/* + * Wait for a request to complete, successfully or otherwise. + */ +static ssize_t netfs_wait_for_request(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + DEFINE_WAIT(myself); + ssize_t ret; + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + switch (netfs_collect_in_app(rreq, collector)) { + case 0: + break; + case 1: + goto all_collected; + case 2: + continue; + } + } + + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + break; + + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + +all_collected: + finish_wait(&rreq->waitq, &myself); + + ret = rreq->error; + if (ret == 0) { + ret = rreq->transferred; + switch (rreq->origin) { + case NETFS_DIO_READ: + case NETFS_DIO_WRITE: + case NETFS_READ_SINGLE: + case NETFS_UNBUFFERED_READ: + case NETFS_UNBUFFERED_WRITE: + break; + default: + if (rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + break; + } + } + + return ret; +} + +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) +{ + return netfs_wait_for_request(rreq, netfs_read_collection); +} + +ssize_t netfs_wait_for_write(struct netfs_io_request *rreq) +{ + return netfs_wait_for_request(rreq, netfs_write_collection); +} + +/* + * Wait for a paused operation to unpause or complete in some manner. + */ +static void netfs_wait_for_pause(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + DEFINE_WAIT(myself); + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + switch (netfs_collect_in_app(rreq, collector)) { + case 0: + break; + case 1: + goto all_collected; + case 2: + continue; + } + } + + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || + !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + break; + + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + +all_collected: + finish_wait(&rreq->waitq, &myself); +} + +void netfs_wait_for_paused_read(struct netfs_io_request *rreq) +{ + return netfs_wait_for_pause(rreq, netfs_read_collection); +} + +void netfs_wait_for_paused_write(struct netfs_io_request *rreq) +{ + return netfs_wait_for_pause(rreq, netfs_write_collection); +} diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index dc6b41ef18b0..e8c99738b5bb 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -10,6 +10,8 @@ #include <linux/delay.h> #include "internal.h" +static void netfs_free_request(struct work_struct *work); + /* * Allocate an I/O request and initialise it. */ @@ -34,6 +36,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } memset(rreq, 0, kmem_cache_size(cache)); + INIT_WORK(&rreq->cleanup_work, netfs_free_request); rreq->start = start; rreq->len = len; rreq->origin = origin; @@ -49,13 +52,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); init_waitqueue_head(&rreq->waitq); - refcount_set(&rreq->ref, 1); + refcount_set(&rreq->ref, 2); if (origin == NETFS_READAHEAD || origin == NETFS_READPAGE || origin == NETFS_READ_GAPS || origin == NETFS_READ_SINGLE || origin == NETFS_READ_FOR_WRITE || + origin == NETFS_UNBUFFERED_READ || origin == NETFS_DIO_READ) { INIT_WORK(&rreq->work, netfs_read_collection_worker); rreq->io_streams[0].avail = true; @@ -64,8 +68,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (file && file->f_flags & O_NONBLOCK) - __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { @@ -75,7 +77,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } atomic_inc(&ctx->io_count); - trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); + trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; @@ -89,7 +91,7 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace trace_netfs_rreq_ref(rreq->debug_id, r + 1, what); } -void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) +void netfs_clear_subrequests(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream; @@ -101,8 +103,7 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) subreq = list_first_entry(&stream->subrequests, struct netfs_io_subrequest, rreq_link); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, was_async, - netfs_sreq_trace_put_clear); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_clear); } } } @@ -118,13 +119,19 @@ static void netfs_free_request_rcu(struct rcu_head *rcu) static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); + container_of(work, struct netfs_io_request, cleanup_work); struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); + + /* Cancel/flush the result collection worker. That does not carry a + * ref of its own, so we must wait for it somewhere. + */ + cancel_work_sync(&rreq->work); + netfs_proc_del_rreq(rreq); - netfs_clear_subrequests(rreq, false); + netfs_clear_subrequests(rreq); if (rreq->netfs_ops->free_request) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) @@ -145,8 +152,7 @@ static void netfs_free_request(struct work_struct *work) call_rcu(&rreq->rcu, netfs_free_request_rcu); } -void netfs_put_request(struct netfs_io_request *rreq, bool was_async, - enum netfs_rreq_ref_trace what) +void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what) { unsigned int debug_id; bool dead; @@ -156,15 +162,8 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async, debug_id = rreq->debug_id; dead = __refcount_dec_and_test(&rreq->ref, &r); trace_netfs_rreq_ref(debug_id, r - 1, what); - if (dead) { - if (was_async) { - rreq->work.func = netfs_free_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - WARN_ON(1); - } else { - netfs_free_request(&rreq->work); - } - } + if (dead) + WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work)); } } @@ -206,8 +205,7 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq, what); } -static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, - bool was_async) +static void netfs_free_subrequest(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; @@ -216,10 +214,10 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, rreq->netfs_ops->free_subrequest(subreq); mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool); netfs_stat_d(&netfs_n_rh_sreq); - netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); + netfs_put_request(rreq, netfs_rreq_trace_put_subreq); } -void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, +void netfs_put_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what) { unsigned int debug_index = subreq->debug_index; @@ -230,5 +228,5 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, dead = __refcount_dec_and_test(&subreq->ref, &r); trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what); if (dead) - netfs_free_subrequest(subreq, was_async); + netfs_free_subrequest(subreq); } diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 23c75755ad4e..96ee18af28ef 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -83,14 +83,12 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq, } just_unlock: - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio->index == rreq->no_unlock_folio && - test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { - _debug("no unlock"); - } else { - trace_netfs_folio(folio, netfs_folio_trace_read_unlock); - folio_unlock(folio); - } + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { + _debug("no unlock"); + } else { + trace_netfs_folio(folio, netfs_folio_trace_read_unlock); + folio_unlock(folio); } folioq_clear(folioq, slot); @@ -280,9 +278,13 @@ reassess: stream->need_retry = true; notes |= NEED_RETRY | MADE_PROGRESS; break; + } else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) { + notes |= MADE_PROGRESS; } else { if (!stream->failed) - stream->transferred = stream->collected_to - rreq->start; + stream->transferred += transferred; + if (front->transferred < front->len) + set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags); notes |= MADE_PROGRESS; } @@ -297,7 +299,7 @@ reassess: struct netfs_io_subrequest, rreq_link); stream->front = front; spin_unlock(&rreq->lock); - netfs_put_subrequest(remove, false, + netfs_put_subrequest(remove, notes & ABANDON_SREQ ? netfs_sreq_trace_put_abandon : netfs_sreq_trace_put_done); @@ -311,14 +313,8 @@ reassess: if (notes & NEED_RETRY) goto need_retry; - if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) { - trace_netfs_rreq(rreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags); - smp_mb__after_atomic(); /* Set PAUSE before task state */ - wake_up(&rreq->waitq); - } - if (notes & MADE_PROGRESS) { + netfs_wake_rreq_flag(rreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause); //cond_resched(); goto reassess; } @@ -342,24 +338,10 @@ need_retry: */ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) { - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned int i; - /* Collect unbuffered reads and direct reads, adding up the transfer - * sizes until we find the first short or failed subrequest. - */ - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - rreq->transferred += subreq->transferred; - - if (subreq->transferred < subreq->len || - test_bit(NETFS_SREQ_FAILED, &subreq->flags)) { - rreq->error = subreq->error; - break; - } - } - - if (rreq->origin == NETFS_DIO_READ) { + if (rreq->origin == NETFS_UNBUFFERED_READ || + rreq->origin == NETFS_DIO_READ) { for (i = 0; i < rreq->direct_bv_count; i++) { flush_dcache_page(rreq->direct_bv[i].bv_page); // TODO: cifs marks pages in the destination buffer @@ -377,7 +359,8 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) } if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); - if (rreq->origin == NETFS_DIO_READ) + if (rreq->origin == NETFS_UNBUFFERED_READ || + rreq->origin == NETFS_DIO_READ) inode_dio_end(rreq->inode); } @@ -410,7 +393,7 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq) * Note that we're in normal kernel thread context at this point, possibly * running on a workqueue. */ -static void netfs_read_collection(struct netfs_io_request *rreq) +bool netfs_read_collection(struct netfs_io_request *rreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; @@ -420,11 +403,11 @@ static void netfs_read_collection(struct netfs_io_request *rreq) * queue is empty. */ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags)) - return; + return false; smp_rmb(); /* Read ALL_QUEUED before subreq lists. */ if (!list_empty(&stream->subrequests)) - return; + return false; /* Okay, declare that all I/O is complete. */ rreq->transferred = stream->transferred; @@ -433,6 +416,7 @@ static void netfs_read_collection(struct netfs_io_request *rreq) //netfs_rreq_is_still_valid(rreq); switch (rreq->origin) { + case NETFS_UNBUFFERED_READ: case NETFS_DIO_READ: case NETFS_READ_GAPS: netfs_rreq_assess_dio(rreq); @@ -445,14 +429,15 @@ static void netfs_read_collection(struct netfs_io_request *rreq) } task_io_account_read(rreq->transferred); - trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + netfs_wake_rreq_flag(rreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); + /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ trace_netfs_rreq(rreq, netfs_rreq_trace_done); - netfs_clear_subrequests(rreq, false); + netfs_clear_subrequests(rreq); netfs_unlock_abandoned_read_pages(rreq); if (unlikely(rreq->copy_to_cache)) netfs_pgpriv2_end_copy_to_cache(rreq); + return true; } void netfs_read_collection_worker(struct work_struct *work) @@ -460,26 +445,12 @@ void netfs_read_collection_worker(struct work_struct *work) struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); netfs_see_request(rreq, netfs_rreq_trace_see_work); - if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - netfs_read_collection(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_work); -} - -/* - * Wake the collection work item. - */ -void netfs_wake_read_collector(struct netfs_io_request *rreq) -{ - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && - !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { - if (!work_pending(&rreq->work)) { - netfs_get_request(rreq, netfs_rreq_trace_get_work); - if (!queue_work(system_unbound_wq, &rreq->work)) - netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq); - } - } else { - trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); - wake_up(&rreq->waitq); + if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) { + if (netfs_read_collection(rreq)) + /* Drop the ref from the IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + else + netfs_see_request(rreq, netfs_rreq_trace_see_work_complete); } } @@ -511,7 +482,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq) list_is_first(&subreq->rreq_link, &stream->subrequests) ) { __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -535,7 +506,6 @@ EXPORT_SYMBOL(netfs_read_subreq_progress); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -582,23 +552,15 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) } trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ - - /* If we are at the head of the queue, wake up the collector. */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests) || - test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) - netfs_wake_read_collector(rreq); - - netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); + netfs_subreq_clear_in_progress(subreq); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_read_subreq_terminated); /* * Handle termination of a read from the cache. */ -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async) +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error) { struct netfs_io_subrequest *subreq = priv; @@ -613,94 +575,3 @@ void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool } netfs_read_subreq_terminated(subreq); } - -/* - * Wait for the read operation to complete, successfully or otherwise. - */ -ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); - ssize_t ret; - - for (;;) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { - __set_current_state(TASK_RUNNING); - netfs_read_collection(rreq); - continue; - } - - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); - - ret = rreq->error; - if (ret == 0) { - ret = rreq->transferred; - switch (rreq->origin) { - case NETFS_DIO_READ: - case NETFS_READ_SINGLE: - ret = rreq->transferred; - break; - default: - if (rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - break; - } - } - - return ret; -} - -/* - * Wait for a paused read operation to unpause or complete in some manner. - */ -void netfs_wait_for_pause(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); - - for (;;) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { - __set_current_state(TASK_RUNNING); - netfs_read_collection(rreq); - continue; - } - } - - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || - !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) - break; - - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); -} diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index cf7727060215..5bbe906a551d 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -116,7 +116,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( return creq; cancel_put: - netfs_put_request(creq, false, netfs_rreq_trace_put_return); + netfs_put_request(creq, netfs_rreq_trace_put_return); cancel: rreq->copy_to_cache = ERR_PTR(-ENOBUFS); clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); @@ -155,7 +155,7 @@ void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq) smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags); - netfs_put_request(creq, false, netfs_rreq_trace_put_return); + netfs_put_request(creq, netfs_rreq_trace_put_return); creq->copy_to_cache = NULL; } diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0f294b26e08c..b99e84a8170a 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -173,7 +173,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) &stream->subrequests, rreq_link) { trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); if (subreq == to) break; } @@ -257,35 +257,15 @@ abandon: */ void netfs_retry_reads(struct netfs_io_request *rreq) { - struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); netfs_stat(&netfs_n_rh_retry_read_req); - set_bit(NETFS_RREQ_RETRYING, &rreq->flags); - /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) - continue; - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - for (;;) { - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) - break; - - trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); - } + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); + netfs_wait_for_in_progress_stream(rreq, stream); clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index fea0ecdecc53..fa622a6cd56d 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -142,7 +142,7 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); return ret; cancel: - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); return ret; } @@ -185,11 +185,11 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite netfs_single_dispatch_read(rreq); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret; cleanup_free: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(rreq, netfs_rreq_trace_put_failed); return ret; } EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 3fca59e6475d..e2b102ffb768 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -280,7 +280,7 @@ reassess_streams: struct netfs_io_subrequest, rreq_link); stream->front = front; spin_unlock(&wreq->lock); - netfs_put_subrequest(remove, false, + netfs_put_subrequest(remove, notes & SAW_FAILURE ? netfs_sreq_trace_put_cancel : netfs_sreq_trace_put_done); @@ -321,18 +321,14 @@ reassess_streams: if (notes & NEED_RETRY) goto need_retry; - if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { - trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); - smp_mb__after_atomic(); /* Set PAUSE before task state */ - wake_up(&wreq->waitq); - } - if (notes & NEED_REASSESS) { + if (notes & MADE_PROGRESS) { + netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause); //cond_resched(); goto reassess_streams; } - if (notes & MADE_PROGRESS) { + + if (notes & NEED_REASSESS) { //cond_resched(); goto reassess_streams; } @@ -356,30 +352,21 @@ need_retry: /* * Perform the collection of subrequests, folios and encryption buffers. */ -void netfs_write_collection_worker(struct work_struct *work) +bool netfs_write_collection(struct netfs_io_request *wreq) { - struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work); struct netfs_inode *ictx = netfs_inode(wreq->inode); size_t transferred; int s; _enter("R=%x", wreq->debug_id); - netfs_see_request(wreq, netfs_rreq_trace_see_work); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } - netfs_collect_write_results(wreq); /* We're done when the app thread has finished posting subreqs and all * the queues in all the streams are empty. */ - if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } + if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) + return false; smp_rmb(); /* Read ALL_QUEUED before lists. */ transferred = LONG_MAX; @@ -387,10 +374,8 @@ void netfs_write_collection_worker(struct work_struct *work) struct netfs_io_stream *stream = &wreq->io_streams[s]; if (!stream->active) continue; - if (!list_empty(&stream->subrequests)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } + if (!list_empty(&stream->subrequests)) + return false; if (stream->transferred < transferred) transferred = stream->transferred; } @@ -428,8 +413,8 @@ void netfs_write_collection_worker(struct work_struct *work) inode_dio_end(wreq->inode); _debug("finished"); - trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags); + netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); + /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); @@ -440,19 +425,21 @@ void netfs_write_collection_worker(struct work_struct *work) wreq->iocb = VFS_PTR_POISON; } - netfs_clear_subrequests(wreq, false); - netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete); + netfs_clear_subrequests(wreq); + return true; } -/* - * Wake the collection work item. - */ -void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) +void netfs_write_collection_worker(struct work_struct *work) { - if (!work_pending(&wreq->work)) { - netfs_get_request(wreq, netfs_rreq_trace_get_work); - if (!queue_work(system_unbound_wq, &wreq->work)) - netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq); + struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + + netfs_see_request(rreq, netfs_rreq_trace_see_work); + if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) { + if (netfs_write_collection(rreq)) + /* Drop the ref from the IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + else + netfs_see_request(rreq, netfs_rreq_trace_see_work_complete); } } @@ -460,7 +447,6 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) * netfs_write_subrequest_terminated - Note the termination of a write operation. * @_op: The I/O request that has terminated. * @transferred_or_error: The amount of data transferred or an error code. - * @was_async: The termination was asynchronous * * This tells the library that a contributory write I/O operation has * terminated, one way or another, and that it should collect the results. @@ -470,21 +456,16 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) * negative error code. The library will look after reissuing I/O operations * as appropriate and writing downloaded data to the cache. * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. - * * When this is called, ownership of the subrequest is transferred back to the * library, along with a ref. * * Note that %_op is a void* so that the function can be passed to * kiocb::term_func without the need for a casting wrapper. */ -void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, - bool was_async) +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error) { struct netfs_io_subrequest *subreq = _op; struct netfs_io_request *wreq = subreq->rreq; - struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); @@ -495,8 +476,6 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, case NETFS_WRITE_TO_CACHE: netfs_stat(&netfs_n_wh_write_done); break; - case NETFS_INVALID_WRITE: - break; default: BUG(); } @@ -536,15 +515,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, } trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - - /* If we are at the head of the queue, wake up the collector, - * transferring a ref to it if we were the ones to do so. - */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) - netfs_wake_write_collector(wreq, was_async); - - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + netfs_subreq_clear_in_progress(subreq); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_write_subrequest_terminated); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 77279fc5b5a7..50bee2c4130d 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -134,7 +134,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, return wreq; nomem: wreq->error = -ENOMEM; - netfs_put_request(wreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(wreq, netfs_rreq_trace_put_failed); return ERR_PTR(-ENOMEM); } @@ -233,7 +233,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) - return netfs_write_subrequest_terminated(subreq, subreq->error, false); + return netfs_write_subrequest_terminated(subreq, subreq->error); trace_netfs_sreq(subreq, netfs_sreq_trace_submit); stream->issue_write(subreq); @@ -542,7 +542,7 @@ static void netfs_end_issue_write(struct netfs_io_request *wreq) } if (needs_poke) - netfs_wake_write_collector(wreq, false); + netfs_wake_collector(wreq); } /* @@ -576,6 +576,7 @@ int netfs_writepages(struct address_space *mapping, goto couldnt_start; } + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); trace_netfs_write(wreq, netfs_write_trace_writeback); netfs_stat(&netfs_n_wh_writepages); @@ -599,8 +600,9 @@ int netfs_writepages(struct address_space *mapping, netfs_end_issue_write(wreq); mutex_unlock(&ictx->wb_lock); + netfs_wake_collector(wreq); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); _leave(" = %d", error); return error; @@ -673,11 +675,11 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c /* * End a write operation used when writing through the pagecache. */ -int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *writethrough_cache) +ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache) { struct netfs_inode *ictx = netfs_inode(wreq->inode); - int ret; + ssize_t ret; _enter("R=%x", wreq->debug_id); @@ -688,13 +690,11 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr mutex_unlock(&ictx->wb_lock); - if (wreq->iocb) { + if (wreq->iocb) ret = -EIOCBQUEUED; - } else { - wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - ret = wreq->error; - } - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + else + ret = netfs_wait_for_write(wreq); + netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; } @@ -722,10 +722,8 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t start += part; len -= part; rolling_buffer_advance(&wreq->buffer, part); - if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { - trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); - wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags)); - } + if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) + netfs_wait_for_paused_write(wreq); if (test_bit(NETFS_RREQ_FAILED, &wreq->flags)) break; } @@ -885,7 +883,8 @@ int netfs_writeback_single(struct address_space *mapping, goto couldnt_start; } - trace_netfs_write(wreq, netfs_write_trace_writeback); + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); + trace_netfs_write(wreq, netfs_write_trace_writeback_single); netfs_stat(&netfs_n_wh_writepages); if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) @@ -914,8 +913,9 @@ stop: set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); mutex_unlock(&ictx->wb_lock); + netfs_wake_collector(wreq); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); _leave(" = %d", ret); return ret; diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index 545d33079a77..9d1d8a8bab72 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -39,9 +39,10 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { - struct iov_iter source = subreq->io_iter; + struct iov_iter source; - iov_iter_revert(&source, subreq->len - source.count); + netfs_reset_iter(subreq); + source = subreq->io_iter; netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_write(stream, subreq, &source); } @@ -131,7 +132,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, &stream->subrequests, rreq_link) { trace_netfs_sreq(subreq, netfs_sreq_trace_discard); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); if (subreq == to) break; } @@ -199,7 +200,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, */ void netfs_retry_writes(struct netfs_io_request *wreq) { - struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream; int s; @@ -208,16 +208,13 @@ void netfs_retry_writes(struct netfs_io_request *wreq) /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ + set_bit(NETFS_RREQ_RETRYING, &wreq->flags); for (s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; - if (!stream->active) - continue; - - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - } + if (stream->active) + netfs_wait_for_in_progress_stream(wreq, stream); } + clear_bit(NETFS_RREQ_RETRYING, &wreq->flags); // TODO: Enc: Fetch changed partial pages // TODO: Enc: Reencrypt content if needed. diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 6d63b958c4bb..cf35ad3f818a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -180,7 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; - clp->cl_net = get_net(cl_init->net); + clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); @@ -250,7 +250,7 @@ void nfs_free_client(struct nfs_client *clp) if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); - put_net(clp->cl_net); + put_net_track(clp->cl_net, &clp->cl_ns_tracker); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); @@ -439,7 +439,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) spin_unlock(&nn->nfs_client_lock); new = rpc_ops->init_client(new, cl_init); if (!IS_ERR(new)) - nfs_local_probe(new); + nfs_local_probe_async(new); return new; } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 8bdbc4dca89c..10ef46e29b25 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1021,13 +1021,6 @@ out: nfs_inode_find_state_and_recover(inode, stateid); } -void nfs_remove_bad_delegation(struct inode *inode, - const nfs4_stateid *stateid) -{ - nfs_revoke_delegation(inode, stateid); -} -EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); - void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid) { @@ -1070,6 +1063,24 @@ out_rcu_unlock: } /** + * nfs_remove_bad_delegation - handle delegations that are unusable + * @inode: inode to process + * @stateid: the delegation's stateid + * + * If the server ACK-ed our FREE_STATEID then clean + * up the delegation, else mark and keep the revoked state. + */ +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + if (stateid && stateid->type == NFS4_FREED_STATEID_TYPE) + nfs_delegation_mark_returned(inode, stateid); + else + nfs_revoke_delegation(inode, stateid); +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + +/** * nfs_expire_unused_delegation_types * @clp: client to process * @flags: delegation types to expire diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index e6909cafab68..df4807460596 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1129,6 +1129,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); break; case -NFS4ERR_DELAY: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + fallthrough; case -NFS4ERR_GRACE: rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); break; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 4a304cf17c4b..656d5c50bbce 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -400,7 +400,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, * keep ds_clp even if DS is local, so that if local IO cannot * proceed somehow, we can fall back to NFS whenever we want. */ - nfs_local_probe(ds->ds_clp); + nfs_local_probe_async(ds->ds_clp); max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e278a1ad1ca3..8b0785178731 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -367,6 +367,7 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) sreq = netfs->sreq; if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_UNBUFFERED_READ && sreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 119e447758b9..8ab7868807a7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -557,6 +557,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); + else + set_nlink(inode, 1); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (fattr_supported & NFS_ATTR_FATTR_OWNER) @@ -633,6 +635,34 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr) +{ + unsigned int cache_flags = 0; + + if (attr->ia_valid & ATTR_MTIME_SET) { + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now; + int updated = 0; + + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (!timespec64_equal(&now, &mtime)) + updated |= S_MTIME; + + inode_maybe_inc_iversion(inode, updated); + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + inode_set_atime_to_ts(inode, attr->ia_atime); + cache_flags |= NFS_INO_INVALID_ATIME; + } + NFS_I(inode)->cache_validity &= ~cache_flags; +} + static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) { enum file_time_flags time_flags = 0; @@ -701,14 +731,27 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); - nfs_update_timestamps(inode, attr->ia_valid); + if (attr->ia_valid & ATTR_MTIME_SET) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + ATTR_ATIME|ATTR_ATIME_SET); + } else { + nfs_update_timestamps(inode, attr->ia_valid); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); + } spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); } else if (nfs_have_delegated_atime(inode) && attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; + if (attr->ia_valid & ATTR_ATIME_SET) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } else { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; + } } /* Optimization: if the end result is no change, don't RPC */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 6655e5f32ec6..69c2c10ee658 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -455,7 +455,6 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ -extern void nfs_local_probe(struct nfs_client *); extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 4ec952f9f47d..510d0a16cfe9 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -171,7 +171,7 @@ static bool nfs_server_uuid_is_local(struct nfs_client *clp) * - called after alloc_client and init_client (so cl_rpcclient exists) * - this function is idempotent, it can be called for old or new clients */ -void nfs_local_probe(struct nfs_client *clp) +static void nfs_local_probe(struct nfs_client *clp) { /* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */ if (!localio_enabled || @@ -191,14 +191,16 @@ void nfs_local_probe(struct nfs_client *clp) nfs_localio_enable_client(clp); nfs_uuid_end(&clp->cl_uuid); } -EXPORT_SYMBOL_GPL(nfs_local_probe); void nfs_local_probe_async_work(struct work_struct *work) { struct nfs_client *clp = container_of(work, struct nfs_client, cl_local_probe_work); + if (!refcount_inc_not_zero(&clp->cl_count)) + return; nfs_local_probe(clp); + nfs_put_client(clp); } void nfs_local_probe_async(struct nfs_client *clp) @@ -207,14 +209,16 @@ void nfs_local_probe_async(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_local_probe_async); -static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf) +static inline void nfs_local_file_put(struct nfsd_file *localio) { - return nfs_to->nfsd_file_get(nf); -} + /* nfs_to_nfsd_file_put_local() expects an __rcu pointer + * but we have a __kernel pointer. It is always safe + * to cast a __kernel pointer to an __rcu pointer + * because the cast only weakens what is known about the pointer. + */ + struct nfsd_file __rcu *nf = (struct nfsd_file __rcu*) localio; -static inline void nfs_local_file_put(struct nfsd_file *nf) -{ - nfs_to->nfsd_file_put(nf); + nfs_to_nfsd_file_put_local(&nf); } /* @@ -226,12 +230,13 @@ static inline void nfs_local_file_put(struct nfsd_file *nf) static struct nfsd_file * __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, const fmode_t mode) { struct nfsd_file *localio; localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, - cred, fh, nfl, mode); + cred, fh, nfl, pnf, mode); if (IS_ERR(localio)) { int status = PTR_ERR(localio); trace_nfs_local_open_fh(fh, mode, status); @@ -258,7 +263,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, const fmode_t mode) { - struct nfsd_file *nf, *new, __rcu **pnf; + struct nfsd_file *nf, __rcu **pnf; if (!nfs_server_is_local(clp)) return NULL; @@ -270,29 +275,9 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, else pnf = &nfl->ro_file; - new = NULL; - rcu_read_lock(); - nf = rcu_dereference(*pnf); - if (!nf) { - rcu_read_unlock(); - new = __nfs_local_open_fh(clp, cred, fh, nfl, mode); - if (IS_ERR(new)) - return NULL; - rcu_read_lock(); - /* try to swap in the pointer */ - spin_lock(&clp->cl_uuid.lock); - nf = rcu_dereference_protected(*pnf, 1); - if (!nf) { - nf = new; - new = NULL; - rcu_assign_pointer(*pnf, nf); - } - spin_unlock(&clp->cl_uuid.lock); - } - nf = nfs_local_file_get(nf); - rcu_read_unlock(); - if (new) - nfs_to_nfsd_file_put_local(new); + nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode); + if (IS_ERR(nf)) + return NULL; return nf; } EXPORT_SYMBOL_GPL(nfs_local_open_fh); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 973aed9cc5fe..7f1ec9c67ff2 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -195,7 +195,6 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out_fc; - mntget(mnt); /* prevent immediate expiration */ if (timeout <= 0) goto out_fc; diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0282d93c8bcc..aafd15a4afce 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -21,6 +21,7 @@ int nfs42_proc_allocate(struct file *, loff_t, loff_t); ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); +int nfs42_proc_zero_range(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 5cf52ece96ac..01c01f45358b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -146,7 +146,8 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | + NFS_CAP_ZERO_RANGE); inode_unlock(inode); return err; @@ -169,7 +170,31 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE | + NFS_CAP_ZERO_RANGE); + + inode_unlock(inode); + return err; +} + +int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], + }; + struct inode *inode = file_inode(filep); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); + if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); return err; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b1b663468249..4cc915d5741d 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -174,6 +174,18 @@ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz + \ + encode_allocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz + \ + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -649,6 +661,27 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, } /* + * Encode ZERO_RANGE request + */ +static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_falloc_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + +/* * Encode READ_PLUS request */ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, @@ -1511,6 +1544,37 @@ out: } /* + * Decode ZERO_RANGE request + */ +static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_falloc_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); + if (status) + goto out; + status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + +/* * Decode READ_PLUS request */ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7d383d29a995..d3ca91f60fc1 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -67,8 +67,7 @@ struct nfs4_minor_version_ops { void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); int (*test_and_free_expired)(struct nfs_server *, - const nfs4_stateid *, - const struct cred *); + nfs4_stateid *, const struct cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); void (*session_trunk)(struct rpc_clnt *clnt, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1cd9652f3c28..5e9d66f3466c 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -225,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + switch (mode) { + case 0: + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + case FALLOC_FL_ZERO_RANGE: + break; + default: return -EOPNOTSUPP; + } ret = inode_newsize_ok(inode, offset + len); if (ret < 0) @@ -234,6 +240,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (mode & FALLOC_FL_PUNCH_HOLE) return nfs42_proc_deallocate(filep, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + return nfs42_proc_zero_range(filep, offset ,len); return nfs42_proc_allocate(filep, offset, len); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b1d2122bd5a7..341740fa293d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,7 +105,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, bool is_privileged); static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *); -static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, const struct cred *, bool); #endif @@ -325,14 +325,14 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, if (nfs_have_delegated_mtime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); if (!(cache_validity & NFS_INO_INVALID_MTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_MODIFY; + dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET); if (!(cache_validity & NFS_INO_INVALID_CTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_METADATA; + dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET); } else if (nfs_have_delegated_atime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); } } @@ -2903,16 +2903,14 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st } static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { return -NFS4ERR_BAD_STATEID; } #if defined(CONFIG_NFS_V4_1) static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { int status; @@ -2921,6 +2919,7 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, break; case NFS4_INVALID_STATEID_TYPE: case NFS4_SPECIAL_STATEID_TYPE: + case NFS4_FREED_STATEID_TYPE: return -NFS4ERR_BAD_STATEID; case NFS4_REVOKED_STATEID_TYPE: goto out_free; @@ -3976,8 +3975,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | - FATTR4_WORD2_OPEN_ARGUMENTS; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + if (minorversion > 1) + bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { @@ -5164,13 +5164,15 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ } static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, - struct nfs4_createdata *data) + struct nfs4_createdata *data, int *statusp) { - int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + struct dentry *ret; + + *statusp = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); - if (status) - return ERR_PTR(status); + if (*statusp) + return NULL; spin_lock(&dir->i_lock); /* Creating a directory bumps nlink in the parent */ @@ -5179,7 +5181,11 @@ static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, data->res.fattr->time_start, NFS_INO_INVALID_DATA); spin_unlock(&dir->i_lock); - return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + ret = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + if (!IS_ERR(ret)) + return ret; + *statusp = PTR_ERR(ret); + return NULL; } static void nfs4_free_createdata(struct nfs4_createdata *data) @@ -5240,17 +5246,18 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - struct nfs4_label *label) + struct nfs4_label *label, int *statusp) { struct nfs4_createdata *data; - struct dentry *ret = ERR_PTR(-ENOMEM); + struct dentry *ret = NULL; + *statusp = -ENOMEM; data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); if (data == NULL) goto out; data->arg.label = label; - ret = nfs4_do_mkdir(dir, dentry, data); + ret = nfs4_do_mkdir(dir, dentry, data, statusp); nfs4_free_createdata(data); out: @@ -5273,11 +5280,12 @@ static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); do { - alias = _nfs4_proc_mkdir(dir, dentry, sattr, label); - err = PTR_ERR_OR_ZERO(alias); + alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err); trace_nfs4_mkdir(dir, &dentry->d_name, err); - err = nfs4_handle_exception(NFS_SERVER(dir), err, - &exception); + if (err) + alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + err, + &exception)); } while (exception.retry); nfs4_label_release_security(label); @@ -6211,6 +6219,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, struct nfs_server *server = NFS_SERVER(inode); int ret; + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); @@ -6285,6 +6295,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, { struct nfs4_exception exception = { }; int err; + + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; do { err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); @@ -10611,7 +10624,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { * Note: this function is always asynchronous. */ static int nfs41_free_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, + nfs4_stateid *stateid, const struct cred *cred, bool privileged) { @@ -10651,6 +10664,7 @@ static int nfs41_free_stateid(struct nfs_server *server, if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); + stateid->type = NFS4_FREED_STATEID_TYPE; return 0; } @@ -10817,6 +10831,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_OFFLOAD_CANCEL | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE + | NFS_CAP_ZERO_RANGE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE @@ -10852,7 +10867,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { - ssize_t error, error2, error3; + ssize_t error, error2, error3, error4; size_t left = size; error = generic_listxattr(dentry, list, left); @@ -10875,8 +10890,16 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left); if (error3 < 0) return error3; + if (list) { + list += error3; + left -= error3; + } + + error4 = security_inode_listsecurity(d_inode(dentry), list, left); + if (error4 < 0) + return error4; - error += error2 + error3; + error += error2 + error3 + error4; if (size && error > size) return -ERANGE; return error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 55bef5fbfa47..318afde38057 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7711,6 +7711,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), PROC42(READ_PLUS, enc_read_plus, dec_read_plus), + PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 91ef486f40b9..b4ccdf78d4dd 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -830,10 +830,16 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .servername = clp->cl_hostname, .connect_timeout = connect_timeout, .reconnect_timeout = connect_timeout, + .xprtsec = clp->cl_xprtsec, }; - if (da->da_transport != clp->cl_proto) + if (da->da_transport != clp->cl_proto && + clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) continue; + if (da->da_transport == XPRT_TRANSPORT_TCP && + mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + if (da->da_addr.ss_family != clp->cl_addr.ss_family) continue; /* Add this address as an alias */ @@ -841,6 +847,9 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, rpc_clnt_test_and_add_xprt, NULL); continue; } + if (da->da_transport == XPRT_TRANSPORT_TCP && + mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + da->da_transport = XPRT_TRANSPORT_TCP_TLS; clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, da->da_transport, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 81bd1b9aba17..3c1fa320b3f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct folio *folio) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); - folio_unlock(folio); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9eea9e62afc9..91b5503b6f74 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1052,6 +1052,16 @@ int nfs_reconfigure(struct fs_context *fc) sync_filesystem(sb); /* + * The SB_RDONLY flag has been removed from the superblock during + * mounts to prevent interference between different filesystems. + * Similarly, it is also necessary to ignore the SB_RDONLY flag + * during reconfiguration; otherwise, it may also result in the + * creation of redundant superblocks when mounting a directory with + * different rw and ro flags multiple times. + */ + fc->sb_flags_mask &= ~SB_RDONLY; + + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which * ones were explicitly specified. Fall back to legacy behavior and @@ -1308,8 +1318,17 @@ int nfs_get_tree_common(struct fs_context *fc) if (IS_ERR(server)) return PTR_ERR(server); + /* + * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a + * superblock among each filesystem that mounts sub-directories + * belonging to a single exported root path. + * To prevent interference between different filesystems, the + * SB_RDONLY flag should be removed from the superblock. + */ if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + else + fc->sb_flags &= ~SB_RDONLY; /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 37cb2b776435..545148d42dcc 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -387,6 +387,33 @@ static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) } #endif /* CONFIG_NFS_V4_1 */ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static ssize_t +localio_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + bool localio = nfs_server_is_local(server->nfs_client); + return sysfs_emit(buf, "%d\n", localio); +} + +static struct kobj_attribute nfs_sysfs_attr_localio = __ATTR_RO(localio); + +static void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ + int ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_localio.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else +static inline void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ +} +#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */ + void nfs_sysfs_add_server(struct nfs_server *server) { int ret; @@ -405,6 +432,7 @@ void nfs_sysfs_add_server(struct nfs_server *server) server->s_sysfs_id, ret); nfs_sysfs_add_nfsv41_server(server); + nfs_sysfs_add_nfs_localio_server(server); } EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 23df8b214474..374fc6b34c79 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -632,19 +632,19 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_async_flush(struct folio *folio, - struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; - int ret = 0; + int ret; + + nfs_pageio_cond_complete(pgio, folio->index); req = nfs_lock_and_join_requests(folio); if (!req) - goto out; - ret = PTR_ERR(req); + return 0; if (IS_ERR(req)) - goto out; + return PTR_ERR(req); nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); @@ -654,7 +654,6 @@ static int nfs_page_async_flush(struct folio *folio, if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -662,28 +661,20 @@ static int nfs_page_async_flush(struct folio *folio, */ if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - if (wbc->sync_mode == WB_SYNC_NONE) - ret = AOP_WRITEPAGE_ACTIVATE; folio_redirty_for_writepage(wbc, folio); nfs_redirty_request(req); pgio->pg_error = 0; - } else - nfs_add_stats(folio->mapping->host, - NFSIOS_WRITEPAGES, 1); -out: - return ret; + return ret; + } + + nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1); + return 0; + out_launder: nfs_write_error(req, ret); return 0; } -static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) -{ - nfs_pageio_cond_complete(pgio, folio->index); - return nfs_page_async_flush(folio, wbc, pgio); -} - /* * Write an mmapped page to the server. */ @@ -703,17 +694,6 @@ static int nfs_writepage_locked(struct folio *folio, return err; } -static int nfs_writepages_callback(struct folio *folio, - struct writeback_control *wbc, void *data) -{ - int ret; - - ret = nfs_do_writepage(folio, wbc, data); - if (ret != AOP_WRITEPAGE_ACTIVATE) - folio_unlock(folio); - return ret; -} - static void nfs_io_completion_commit(void *inode) { nfs_commit_inode(inode, 0); @@ -749,11 +729,15 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } do { + struct folio *folio = NULL; + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, - &pgio); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + err = nfs_do_writepage(folio, wbc, &pgio); + folio_unlock(folio); + } pgio.pg_error = 0; nfs_pageio_complete(&pgio); if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR) diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 6a0bdea6d644..05c7c16e37ab 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -151,8 +151,7 @@ EXPORT_SYMBOL_GPL(nfs_localio_enable_client); */ static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid) { - LIST_HEAD(local_files); - struct nfs_file_localio *nfl, *tmp; + struct nfs_file_localio *nfl; spin_lock(&nfs_uuid->lock); if (unlikely(!rcu_access_pointer(nfs_uuid->net))) { @@ -166,17 +165,42 @@ static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid) nfs_uuid->dom = NULL; } - list_splice_init(&nfs_uuid->files, &local_files); - spin_unlock(&nfs_uuid->lock); - /* Walk list of files and ensure their last references dropped */ - list_for_each_entry_safe(nfl, tmp, &local_files, list) { - nfs_close_local_fh(nfl); + + while ((nfl = list_first_entry_or_null(&nfs_uuid->files, + struct nfs_file_localio, + list)) != NULL) { + /* If nfs_uuid is already NULL, nfs_close_local_fh is + * closing and we must wait, else we unlink and close. + */ + if (rcu_access_pointer(nfl->nfs_uuid) == NULL) { + /* nfs_close_local_fh() is doing the + * close and we must wait. until it unlinks + */ + wait_var_event_spinlock(nfl, + list_first_entry_or_null( + &nfs_uuid->files, + struct nfs_file_localio, + list) != nfl, + &nfs_uuid->lock); + continue; + } + + /* Remove nfl from nfs_uuid->files list */ + list_del_init(&nfl->list); + spin_unlock(&nfs_uuid->lock); + + nfs_to_nfsd_file_put_local(&nfl->ro_file); + nfs_to_nfsd_file_put_local(&nfl->rw_file); cond_resched(); - } - spin_lock(&nfs_uuid->lock); - BUG_ON(!list_empty(&nfs_uuid->files)); + spin_lock(&nfs_uuid->lock); + /* Now we can allow racing nfs_close_local_fh() to + * skip the locking. + */ + RCU_INIT_POINTER(nfl->nfs_uuid, NULL); + wake_up_var_locked(&nfl->nfs_uuid, &nfs_uuid->lock); + } /* Remove client from nn->local_clients */ if (nfs_uuid->list_lock) { @@ -237,6 +261,7 @@ static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, struct rpc_clnt *rpc_clnt, const struct cred *cred, const struct nfs_fh *nfs_fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, const fmode_t fmode) { struct net *net; @@ -261,10 +286,9 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, rcu_read_unlock(); /* We have an implied reference to net thanks to nfsd_net_try_get */ localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, - cred, nfs_fh, fmode); - if (IS_ERR(localio)) - nfs_to_nfsd_net_put(net); - else + cred, nfs_fh, pnf, fmode); + nfs_to_nfsd_net_put(net); + if (!IS_ERR(localio)) nfs_uuid_add_file(uuid, nfl); return localio; @@ -273,8 +297,6 @@ EXPORT_SYMBOL_GPL(nfs_open_local_fh); void nfs_close_local_fh(struct nfs_file_localio *nfl) { - struct nfsd_file *ro_nf = NULL; - struct nfsd_file *rw_nf = NULL; nfs_uuid_t *nfs_uuid; rcu_read_lock(); @@ -285,28 +307,39 @@ void nfs_close_local_fh(struct nfs_file_localio *nfl) return; } - ro_nf = rcu_access_pointer(nfl->ro_file); - rw_nf = rcu_access_pointer(nfl->rw_file); - if (ro_nf || rw_nf) { - spin_lock(&nfs_uuid->lock); - if (ro_nf) - ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1); - if (rw_nf) - rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1); - - /* Remove nfl from nfs_uuid->files list */ - RCU_INIT_POINTER(nfl->nfs_uuid, NULL); - list_del_init(&nfl->list); + spin_lock(&nfs_uuid->lock); + if (!rcu_access_pointer(nfl->nfs_uuid)) { + /* nfs_uuid_put has finished here */ spin_unlock(&nfs_uuid->lock); rcu_read_unlock(); - - if (ro_nf) - nfs_to_nfsd_file_put_local(ro_nf); - if (rw_nf) - nfs_to_nfsd_file_put_local(rw_nf); return; } + if (list_empty(&nfs_uuid->files)) { + /* nfs_uuid_put() has started closing files, wait for it + * to finished + */ + spin_unlock(&nfs_uuid->lock); + rcu_read_unlock(); + wait_var_event(&nfl->nfs_uuid, + rcu_access_pointer(nfl->nfs_uuid) == NULL); + return; + } + /* tell nfs_uuid_put() to wait for us */ + RCU_INIT_POINTER(nfl->nfs_uuid, NULL); + spin_unlock(&nfs_uuid->lock); rcu_read_unlock(); + + nfs_to_nfsd_file_put_local(&nfl->ro_file); + nfs_to_nfsd_file_put_local(&nfl->rw_file); + + /* Remove nfl from nfs_uuid->files list and signal nfs_uuid_put() + * that we are done. The moment we drop the spinlock the + * nfs_uuid could be freed. + */ + spin_lock(&nfs_uuid->lock); + list_del_init(&nfl->list); + wake_up_var_locked(&nfl->nfs_uuid, &nfs_uuid->lock); + spin_unlock(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_close_local_fh); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ab85e6a2454f..e108b6c705b4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -378,15 +378,41 @@ nfsd_file_put(struct nfsd_file *nf) * the reference of the nfsd_file. */ struct net * -nfsd_file_put_local(struct nfsd_file *nf) +nfsd_file_put_local(struct nfsd_file __rcu **pnf) { - struct net *net = nf->nf_net; + struct nfsd_file *nf; + struct net *net = NULL; - nfsd_file_put(nf); + nf = unrcu_pointer(xchg(pnf, NULL)); + if (nf) { + net = nf->nf_net; + nfsd_file_put(nf); + } return net; } /** + * nfsd_file_get_local - get nfsd_file reference and reference to net + * @nf: nfsd_file of which to put the reference + * + * Get reference to both the nfsd_file and nf->nf_net. + */ +struct nfsd_file * +nfsd_file_get_local(struct nfsd_file *nf) +{ + struct net *net = nf->nf_net; + + if (nfsd_net_try_get(net)) { + nf = nfsd_file_get(nf); + if (!nf) + nfsd_net_put(net); + } else { + nf = NULL; + } + return nf; +} + +/** * nfsd_file_file - get the backing file of an nfsd_file * @nf: nfsd_file of which to access the backing file. * diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 5865f9c72712..722b26c71e45 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -62,7 +62,8 @@ void nfsd_file_cache_shutdown(void); int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); -struct net *nfsd_file_put_local(struct nfsd_file *nf); +struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); +struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 238647fa379e..80d9ff6608a7 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -24,21 +24,6 @@ #include "filecache.h" #include "cache.h" -static const struct nfsd_localio_operations nfsd_localio_ops = { - .nfsd_net_try_get = nfsd_net_try_get, - .nfsd_net_put = nfsd_net_put, - .nfsd_open_local_fh = nfsd_open_local_fh, - .nfsd_file_put_local = nfsd_file_put_local, - .nfsd_file_get = nfsd_file_get, - .nfsd_file_put = nfsd_file_put, - .nfsd_file_file = nfsd_file_file, -}; - -void nfsd_localio_ops_init(void) -{ - nfs_to = &nfsd_localio_ops; -} - /** * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file * @@ -47,6 +32,7 @@ void nfsd_localio_ops_init(void) * @rpc_clnt: rpc_clnt that the client established * @cred: cred that the client established * @nfs_fh: filehandle to lookup + * @nfp: place to find the nfsd_file, or store it if it was non-NULL * @fmode: fmode_t to use for open * * This function maps a local fh to a path on a local filesystem. @@ -57,10 +43,11 @@ void nfsd_localio_ops_init(void) * set. Caller (NFS client) is responsible for calling nfsd_net_put and * nfsd_file_put (via nfs_to_nfsd_file_put_local). */ -struct nfsd_file * +static struct nfsd_file * nfsd_open_local_fh(struct net *net, struct auth_domain *dom, struct rpc_clnt *rpc_clnt, const struct cred *cred, - const struct nfs_fh *nfs_fh, const fmode_t fmode) + const struct nfs_fh *nfs_fh, struct nfsd_file __rcu **pnf, + const fmode_t fmode) { int mayflags = NFSD_MAY_LOCALIO; struct svc_cred rq_cred; @@ -71,6 +58,15 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, if (nfs_fh->size > NFS4_FHSIZE) return ERR_PTR(-EINVAL); + if (!nfsd_net_try_get(net)) + return ERR_PTR(-ENXIO); + + rcu_read_lock(); + localio = nfsd_file_get(rcu_dereference(*pnf)); + rcu_read_unlock(); + if (localio) + return localio; + /* nfs_fh -> svc_fh */ fh_init(&fh, NFS4_FHSIZE); fh.fh_handle.fh_size = nfs_fh->size; @@ -92,9 +88,47 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, if (rq_cred.cr_group_info) put_group_info(rq_cred.cr_group_info); + if (!IS_ERR(localio)) { + struct nfsd_file *new; + if (!nfsd_net_try_get(net)) { + nfsd_file_put(localio); + nfsd_net_put(net); + return ERR_PTR(-ENXIO); + } + nfsd_file_get(localio); + again: + new = unrcu_pointer(cmpxchg(pnf, NULL, RCU_INITIALIZER(localio))); + if (new) { + /* Some other thread installed an nfsd_file */ + if (nfsd_file_get(new) == NULL) + goto again; + /* + * Drop the ref we were going to install and the + * one we were going to return. + */ + nfsd_file_put(localio); + nfsd_file_put(localio); + localio = new; + } + } else + nfsd_net_put(net); + return localio; } -EXPORT_SYMBOL_GPL(nfsd_open_local_fh); + +static const struct nfsd_localio_operations nfsd_localio_ops = { + .nfsd_net_try_get = nfsd_net_try_get, + .nfsd_net_put = nfsd_net_put, + .nfsd_open_local_fh = nfsd_open_local_fh, + .nfsd_file_put_local = nfsd_file_put_local, + .nfsd_file_get_local = nfsd_file_get_local, + .nfsd_file_file = nfsd_file_file, +}; + +void nfsd_localio_ops_init(void) +{ + nfs_to = &nfsd_localio_ops; +} /* * UUID_IS_LOCAL XDR functions diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 0d8f7fb15c2e..dd0c8e560ef6 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2102,11 +2102,13 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { - if (unlikely(ret == -ENOENT)) + if (unlikely(ret == -ENOENT)) { nilfs_crit(btree->b_inode->i_sb, "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", btree->b_inode->i_ino, (unsigned long long)key, level); + ret = -EINVAL; + } goto out; } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 893ab36824cc..2d8dc6b35b54 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -273,6 +273,9 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap, dat = nilfs_bmap_get_dat(bmap); key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(bmap, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -EINVAL; + if (!buffer_nilfs_volatile(bh)) { oldreq.pr_entry_nr = ptr; newreq.pr_entry_nr = ptr; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 2f850a18d6e7..946b0d3534a5 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -422,8 +422,6 @@ static int nilfs_mdt_write_folio(struct folio *folio, if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_segment(sb); - else if (wbc->for_reclaim) - nilfs_flush_segment(sb, inode->i_ino); return err; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 83970d97840b..61a4141f8d6b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2221,22 +2221,6 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) spin_unlock(&sci->sc_state_lock); } -/** - * nilfs_flush_segment - trigger a segment construction for resource control - * @sb: super block - * @ino: inode number of the file to be flushed out. - */ -void nilfs_flush_segment(struct super_block *sb, ino_t ino) -{ - struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_sc_info *sci = nilfs->ns_writer; - - if (!sci || nilfs_doing_construction()) - return; - nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); - /* assign bit 0 to data files */ -} - struct nilfs_segctor_wait_request { wait_queue_entry_t wq; __u32 seq; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index f723f47ddc4e..4b39ed43ae72 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -226,7 +226,6 @@ extern void nilfs_relax_pressure_in_lock(struct super_block *); extern int nilfs_construct_segment(struct super_block *); extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, loff_t, loff_t); -extern void nilfs_flush_segment(struct super_block *, ino_t); extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, void **); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 34ed242e1063..1e99a35691cd 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -913,7 +913,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; struct ntfs_sb_info *sbi = ni->mi.sbi; - struct page *page, **pages = NULL; + struct page **pages = NULL; + struct folio *folio; size_t written = 0; u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; u32 frame_size = 1u << frame_bits; @@ -923,7 +924,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) u64 frame_vbo; pgoff_t index; bool frame_uptodate; - struct folio *folio; if (frame_size < PAGE_SIZE) { /* @@ -977,8 +977,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) pages_per_frame); if (err) { for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_unlock(folio); folio_put(folio); } @@ -989,10 +988,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ip = off >> PAGE_SHIFT; off = offset_in_page(valid); for (; ip < pages_per_frame; ip++, off = 0) { - page = pages[ip]; - folio = page_folio(page); - zero_user_segment(page, off, PAGE_SIZE); - flush_dcache_page(page); + folio = page_folio(pages[ip]); + folio_zero_segment(folio, off, PAGE_SIZE); + flush_dcache_folio(folio); folio_mark_uptodate(folio); } @@ -1001,8 +999,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ni_unlock(ni); for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_mark_uptodate(folio); folio_unlock(folio); folio_put(folio); @@ -1046,8 +1043,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (err) { for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_unlock(folio); folio_put(folio); } @@ -1065,10 +1061,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { size_t cp, tail = PAGE_SIZE - off; - page = pages[ip]; - cp = copy_page_from_iter_atomic(page, off, + folio = page_folio(pages[ip]); + cp = copy_folio_from_iter_atomic(folio, off, min(tail, bytes), from); - flush_dcache_page(page); + flush_dcache_folio(folio); copied += cp; bytes -= cp; @@ -1088,9 +1084,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ni_unlock(ni); for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - ClearPageDirty(page); - folio = page_folio(page); + folio = page_folio(pages[ip]); + folio_clear_dirty(folio); folio_mark_uptodate(folio); folio_unlock(folio); folio_put(folio); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index fce9beb214f0..43e652a2adaf 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1483,7 +1483,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work) sc_put(sc); } -/* socket shutdown does a del_timer_sync against this as it tears down. +/* socket shutdown does a timer_delete_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ static void o2net_idle_timer(struct timer_list *t) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 1ad7106741f8..3ad7baf67658 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -505,5 +505,5 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - return (!ret ? count : ret); + return ret ?: count; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index e272429da3db..de7f12858729 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -674,7 +674,7 @@ out_put: break; } out: - kfree(rec); + ocfs2_free_quota_recovery(rec); return status; } diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index ddd761cf44c8..a28c127b9934 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -691,8 +691,7 @@ static void __exit ocfs2_stack_glue_exit(void) memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); - if (ocfs2_table_header) - unregister_sysctl_table(ocfs2_table_header); + unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); diff --git a/fs/pipe.c b/fs/pipe.c index da45edd68c41..45077c37bad1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,6 +26,7 @@ #include <linux/memcontrol.h> #include <linux/watch_queue.h> #include <linux/sysctl.h> +#include <linux/sort.h> #include <linux/uaccess.h> #include <asm/ioctls.h> @@ -76,8 +77,6 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ -#define cmp_int(l, r) ((l > r) - (l < r)) - #ifdef CONFIG_PROVE_LOCKING static int pipe_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) diff --git a/fs/pnode.c b/fs/pnode.c index fb77427df39e..ffd429b760d5 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -231,8 +231,8 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp) /* skip if mountpoint isn't visible in m */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return 0; - /* skip if m is in the anon_ns we are emptying */ - if (m->mnt_ns->mntns_flags & MNTNS_PROPAGATING) + /* skip if m is in the anon_ns */ + if (is_anon_ns(m->mnt_ns)) return 0; if (peers(m, last_dest)) { diff --git a/fs/proc/base.c b/fs/proc/base.c index fe33a5843fbd..c667702dc69b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -827,7 +827,13 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; - +/* + * proc_mem_open() can return errno, NULL or mm_struct*. + * + * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) + * - Returns mm_struct* on success + * - Returns error code on failure + */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); @@ -854,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; diff --git a/fs/proc/page.c b/fs/proc/page.c index 23fc771100ae..999af26c7298 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -22,6 +22,12 @@ #define KPMMASK (KPMSIZE - 1) #define KPMBITS (KPMSIZE * BITS_PER_BYTE) +enum kpage_operation { + KPAGE_FLAGS, + KPAGE_COUNT, + KPAGE_CGROUP, +}; + static inline unsigned long get_max_dump_pfn(void) { #ifdef CONFIG_SPARSEMEM @@ -37,19 +43,17 @@ static inline unsigned long get_max_dump_pfn(void) #endif } -/* /proc/kpagecount - an array exposing page mapcounts - * - * Each entry is a u64 representing the corresponding - * physical page mapcount. - */ -static ssize_t kpagecount_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t kpage_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, + enum kpage_operation op) { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; + struct page *page; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; + u64 info; pfn = src / KPMSIZE; if (src & KPMMASK || count & KPMMASK) @@ -59,24 +63,34 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { - struct page *page; - u64 mapcount = 0; - /* * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. */ page = pfn_to_online_page(pfn); - if (page) { - struct folio *folio = page_folio(page); - if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) - mapcount = folio_precise_page_mapcount(folio, page); - else - mapcount = folio_average_page_mapcount(folio); - } - - if (put_user(mapcount, out)) { + if (page) { + switch (op) { + case KPAGE_FLAGS: + info = stable_page_flags(page); + break; + case KPAGE_COUNT: + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + info = folio_precise_page_mapcount(page_folio(page), page); + else + info = folio_average_page_mapcount(page_folio(page)); + break; + case KPAGE_CGROUP: + info = page_cgroup_ino(page); + break; + default: + info = 0; + break; + } + } else + info = 0; + + if (put_user(info, out)) { ret = -EFAULT; break; } @@ -94,17 +108,23 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } +/* /proc/kpagecount - an array exposing page mapcounts + * + * Each entry is a u64 representing the corresponding + * physical page mapcount. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return kpage_read(file, buf, count, ppos, KPAGE_COUNT); +} + static const struct proc_ops kpagecount_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; -/* /proc/kpageflags - an array exposing page flags - * - * Each entry is a u64 representing the corresponding - * physical page flags. - */ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { @@ -225,47 +245,17 @@ u64 stable_page_flags(const struct page *page) #endif return u; -}; +} +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ static ssize_t kpageflags_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - struct page *page = pfn_to_online_page(pfn); - - if (put_user(stable_page_flags(page), out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_FLAGS); } static const struct proc_ops kpageflags_proc_ops = { @@ -276,53 +266,10 @@ static const struct proc_ops kpageflags_proc_ops = { #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - u64 ino; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - ppage = pfn_to_online_page(pfn); - - if (ppage) - ino = page_cgroup_ino(ppage); - else - ino = 0; - - if (put_user(ino, out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_CGROUP); } - static const struct proc_ops kpagecgroup_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 994cde10e3f4..27972c0749e7 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -212,8 +212,8 @@ static int proc_maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; @@ -1325,8 +1325,8 @@ static int smaps_rollup_open(struct inode *inode, struct file *file) priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - ret = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; single_release(inode, file); goto out_free; @@ -2069,8 +2069,8 @@ static int pagemap_open(struct inode *inode, struct file *file) struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } @@ -2087,7 +2087,8 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ + PAGE_IS_GUARD) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -2128,12 +2129,14 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; - if (p->masks_of_interest & PAGE_IS_FILE) { - swp = pte_to_swp_entry(pte); - if (is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) - categories |= PAGE_IS_FILE; - } + swp = pte_to_swp_entry(pte); + if (is_guard_swp_entry(swp)) + categories |= PAGE_IS_GUARD; + else if ((p->masks_of_interest & PAGE_IS_FILE) && + is_pfn_swap_entry(swp) && + !folio_test_anon(pfn_swap_entry_folio(swp))) + categories |= PAGE_IS_FILE; + if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index bce674533000..59bfd61d653a 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -260,8 +260,8 @@ static int maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 89d2dbbb742c..5200a0f3cafc 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -155,6 +155,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cached_fids *cfids; const char *npath; int retries = 0, cur_sleep = 1; + __le32 lease_flags = 0; if (cifs_sb->root == NULL) return -ENOENT; @@ -201,6 +202,8 @@ replay_again: } spin_unlock(&cfids->cfid_list_lock); + pfid = &cfid->fid; + /* * Skip any prefix paths in @path as lookup_noperm_positive_unlocked() ends up * calling ->lookup() which already adds those through @@ -222,6 +225,25 @@ replay_again: rc = -ENOENT; goto out; } + if (dentry->d_parent && server->dialect >= SMB30_PROT_ID) { + struct cached_fid *parent_cfid; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(parent_cfid, &cfids->entries, entry) { + if (parent_cfid->dentry == dentry->d_parent) { + cifs_dbg(FYI, "found a parent cached file handle\n"); + if (parent_cfid->has_lease && parent_cfid->time) { + lease_flags + |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; + memcpy(pfid->parent_lease_key, + parent_cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE); + } + break; + } + } + spin_unlock(&cfids->cfid_list_lock); + } } cfid->dentry = dentry; cfid->tcon = tcon; @@ -236,7 +258,6 @@ replay_again: if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - pfid = &cfid->fid; server->ops->new_lease_key(pfid); memset(rqst, 0, sizeof(rqst)); @@ -256,6 +277,7 @@ replay_again: FILE_READ_EA, .disposition = FILE_OPEN, .fid = pfid, + .lease_flags = lease_flags, .replay = !!(retries), }; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index fb04e263611c..0a5266ecfd15 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -70,7 +70,6 @@ bool require_gcm_256; /* false by default */ bool enable_negotiate_signing; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ -unsigned int sign_CIFS_PDUs = 1; /* * Global transaction id (XID) information diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 3b32116b0b49..ad7dd16db3e9 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -556,7 +556,7 @@ struct smb_version_operations { void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch, bool *purge_cache); /* create lease context buffer for CREATE request */ - char * (*create_lease_buf)(u8 *lease_key, u8 oplock); + char * (*create_lease_buf)(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 le_flags); /* parse lease context buffer and return oplock/epoch info */ __u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, @@ -773,6 +773,7 @@ struct TCP_Server_Info { char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ __u32 reconnect_instance; /* incremented on each reconnect */ + __le32 session_key_id; /* retrieved from negotiate response and send in session setup request */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ @@ -1441,6 +1442,7 @@ struct cifs_open_parms { bool reconnect:1; bool replay:1; /* indicates that this open is for a replay */ struct kvec *ea_cctx; + __le32 lease_flags; }; struct cifs_fid { @@ -1448,6 +1450,7 @@ struct cifs_fid { __u64 persistent_fid; /* persist file id for smb2 */ __u64 volatile_fid; /* volatile file id for smb2 */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ + __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE]; __u8 create_guid[16]; __u32 access; struct cifs_pending_open *pending_open; @@ -1988,8 +1991,7 @@ require use of the stronger protocol */ * TCP_Server_Info-> TCP_Server_Info cifs_get_tcp_session * reconnect_mutex * TCP_Server_Info->srv_mutex TCP_Server_Info cifs_get_tcp_session - * cifs_ses->session_mutex cifs_ses sesInfoAlloc - * cifs_tcon + * cifs_ses->session_mutex cifs_ses sesInfoAlloc * cifs_tcon->open_file_lock cifs_tcon->openFileList tconInfoAlloc * cifs_tcon->pending_opens * cifs_tcon->stat_lock cifs_tcon->bytes_read tconInfoAlloc @@ -2008,21 +2010,25 @@ require use of the stronger protocol */ * ->oplock_credits * ->reconnect_instance * cifs_ses->ses_lock (anything that is not protected by another lock and can change) + * sesInfoAlloc * cifs_ses->iface_lock cifs_ses->iface_list sesInfoAlloc * ->iface_count * ->iface_last_update - * cifs_ses->chan_lock cifs_ses->chans + * cifs_ses->chan_lock cifs_ses->chans sesInfoAlloc * ->chans_need_reconnect * ->chans_in_reconnect * cifs_tcon->tc_lock (anything that is not protected by another lock and can change) + * tcon_info_alloc * inode->i_rwsem, taken by fs/netfs/locking.c e.g. should be taken before cifsInodeInfo locks * cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode * cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once * ->can_cache_brlcks * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc - * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs - * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo + * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs + * cached_fid->fid_lock (anything that is not protected by another lock and can change) + * init_cached_dir + * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search * ->oplock_break_cancelled diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 1b79fe07476f..d9cf7db0ac35 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -597,7 +597,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 SecurityBlobLength; __u32 Reserved; __le32 Capabilities; /* see below */ @@ -616,7 +616,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 CaseInsensitivePasswordLength; /* ASCII password len */ __le16 CaseSensitivePasswordLength; /* Unicode password length*/ __u32 Reserved; /* see below */ @@ -654,7 +654,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 PasswordLength; __u32 Reserved; /* encrypt key len and offset */ __le16 ByteCount; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index ecf774a8f1ca..66093fa78aed 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -151,8 +151,7 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof, bool from_readdir); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); -void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result, - bool was_async); +void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int); extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index f55457b4b82e..7216fcec79e8 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -498,6 +498,7 @@ CIFSSMBNegotiate(const unsigned int xid, server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); + server->session_key_id = pSMBr->SessionKey; server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; @@ -1725,7 +1726,7 @@ cifs_writev_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; - cifs_write_subrequest_terminated(wdata, result, true); + cifs_write_subrequest_terminated(wdata, result); release_mid(mid); trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, 0, server->credits, server->in_flight, @@ -1813,7 +1814,7 @@ async_writev_out: out: if (rc) { add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, false); + cifs_write_subrequest_terminated(wdata, rc); } } @@ -2753,10 +2754,10 @@ int cifs_query_reparse_point(const unsigned int xid, io_req->TotalParameterCount = 0; io_req->TotalDataCount = 0; - io_req->MaxParameterCount = cpu_to_le32(2); + io_req->MaxParameterCount = cpu_to_le32(0); /* BB find exact data count max from sess structure BB */ io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); - io_req->MaxSetupCount = 4; + io_req->MaxSetupCount = 1; io_req->Reserved = 0; io_req->ParameterOffset = 0; io_req->DataCount = 0; @@ -2783,6 +2784,22 @@ int cifs_query_reparse_point(const unsigned int xid, goto error; } + /* SetupCount must be 1, otherwise offset to ByteCount is incorrect. */ + if (io_rsp->SetupCount != 1) { + rc = -EIO; + goto error; + } + + /* + * ReturnedDataLen is output length of executed IOCTL. + * DataCount is output length transferred over network. + * Check that we have full FSCTL_GET_REPARSE_POINT buffer. + */ + if (data_count != le16_to_cpu(io_rsp->ReturnedDataLen)) { + rc = -EIO; + goto error; + } + end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount; start = (__u8 *)&io_rsp->hdr.Protocol + data_offset; if (start >= end) { diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 6bf04d9a5491..024817d40c5f 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -377,6 +377,13 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, if (!cifs_tcp_ses_needs_reconnect(server, 1)) return 0; + /* + * if smb session has been marked for reconnect, also reconnect all + * connections. This way, the other connections do not end up bad. + */ + if (mark_smb_session) + cifs_signal_cifsd_for_reconnect(server, mark_smb_session); + cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); cifs_abort_connection(server); @@ -385,7 +392,8 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, try_to_freeze(); cifs_server_lock(server); - if (!cifs_swn_set_server_dstaddr(server)) { + if (!cifs_swn_set_server_dstaddr(server) && + !SERVER_IS_CHAN(server)) { /* resolve the hostname again to make sure that IP address is up-to-date */ rc = reconn_set_ipaddr_from_hostname(server); cifs_dbg(FYI, "%s: reconn_set_ipaddr_from_hostname: rc=%d\n", __func__, rc); diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index d1e95632ac54..1c6e5389c51f 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -23,6 +23,7 @@ #include "fs_context.h" #include "cifs_ioctl.h" #include "fscache.h" +#include "cached_dir.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -190,6 +191,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; int rdwr_for_fscache = 0; + __le32 lease_flags = 0; *oplock = 0; if (tcon->ses->server->oplocks) @@ -312,6 +314,26 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned create_options |= CREATE_OPTION_READONLY; retry_open: + if (tcon->cfids && direntry->d_parent && server->dialect >= SMB30_PROT_ID) { + struct cached_fid *parent_cfid; + + spin_lock(&tcon->cfids->cfid_list_lock); + list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) { + if (parent_cfid->dentry == direntry->d_parent) { + cifs_dbg(FYI, "found a parent cached file handle\n"); + if (parent_cfid->has_lease && parent_cfid->time) { + lease_flags + |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; + memcpy(fid->parent_lease_key, + parent_cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE); + } + break; + } + } + spin_unlock(&tcon->cfids->cfid_list_lock); + } + oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -320,6 +342,7 @@ retry_open: .disposition = disposition, .path = full_path, .fid = fid, + .lease_flags = lease_flags, .mode = mode, }; rc = server->ops->open(xid, &oparms, oplock, buf); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 950aa4f912f5..d2df10b8e6fd 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -130,7 +130,7 @@ fail: else trace_netfs_sreq(subreq, netfs_sreq_trace_fail); add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, false); + cifs_write_subrequest_terminated(wdata, rc); goto out; } @@ -219,7 +219,8 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) goto failed; } - if (subreq->rreq->origin != NETFS_DIO_READ) + if (subreq->rreq->origin != NETFS_UNBUFFERED_READ && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); trace_netfs_sreq(subreq, netfs_sreq_trace_submit); @@ -2423,8 +2424,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) return rc; } -void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result, - bool was_async) +void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result) { struct netfs_io_request *wreq = wdata->rreq; struct netfs_inode *ictx = netfs_inode(wreq->inode); @@ -2441,7 +2441,7 @@ void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t netfs_resize_file(ictx, wrend, true); } - netfs_write_subrequest_terminated(&wdata->subreq, result, was_async); + netfs_write_subrequest_terminated(&wdata->subreq, result); } struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 7b6ed9b23e71..e77017f47084 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -326,6 +326,14 @@ check_smb_hdr(struct smb_hdr *smb) if (smb->Command == SMB_COM_LOCKING_ANDX) return 0; + /* + * Windows NT server returns error resposne (e.g. STATUS_DELETE_PENDING + * or STATUS_OBJECT_NAME_NOT_FOUND or ERRDOS/ERRbadfile or any other) + * for some TRANS2 requests without the RESPONSE flag set in header. + */ + if (smb->Command == SMB_COM_TRANSACTION2 && smb->Status.CifsError != 0) + return 0; + cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", get_mid(smb)); return 1; diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index e3f9213131c4..52a520349cb7 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -146,6 +146,9 @@ static char *automount_fullpath(struct dentry *dentry, void *page) } spin_unlock(&tcon->tc_lock); + if (unlikely(!page)) + return ERR_PTR(-ENOMEM); + s = dentry_path_raw(dentry, page, PATH_MAX); if (IS_ERR(s)) return s; @@ -283,7 +286,6 @@ struct vfsmount *cifs_d_automount(struct path *path) return newmnt; } - mntget(newmnt); /* prevent immediate expiration */ mnt_set_expiry(newmnt, &cifs_automount_list); schedule_delayed_work(&cifs_automount_task, cifs_mountpoint_expiry_timeout); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index b3fa9ee26912..ec0db32c7d98 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -445,6 +445,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) ses->chans[chan_index].iface = iface; spin_unlock(&ses->chan_lock); + + spin_lock(&server->srv_lock); + memcpy(&server->dstaddr, &iface->sockaddr, sizeof(server->dstaddr)); + spin_unlock(&server->srv_lock); } static int @@ -628,6 +632,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq); pSMB->req.VcNumber = cpu_to_le16(1); + pSMB->req.SessionKey = server->session_key_id; /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -1684,22 +1689,22 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; capabilities = cifs_ssetup_hdr(ses, server, pSMB); - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); - return -ENOSYS; - } - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; capabilities |= CAP_EXTENDED_SECURITY; pSMB->req.Capabilities |= cpu_to_le32(capabilities); bcc_ptr = sess_data->iov[2].iov_base; - /* unicode strings must be word aligned */ - if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { - *bcc_ptr = 0; - bcc_ptr++; + + if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) { + /* unicode strings must be word aligned */ + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + } else { + ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp); } - unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); sess_data->iov[2].iov_len = (long) bcc_ptr - (long) sess_data->iov[2].iov_base; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 2fe8eeb98535..bab9f567d9b7 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4069,7 +4069,7 @@ map_oplock_to_lease(u8 oplock) } static char * -smb2_create_lease_buf(u8 *lease_key, u8 oplock) +smb2_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags) { struct create_lease *buf; @@ -4095,7 +4095,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock) } static char * -smb3_create_lease_buf(u8 *lease_key, u8 oplock) +smb3_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags) { struct create_lease_v2 *buf; @@ -4105,6 +4105,9 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + buf->lcontext.LeaseFlags = flags; + if (flags & SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE) + memcpy(&buf->lcontext.ParentLeaseKey, parent_lease_key, SMB2_LEASE_KEY_SIZE); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_lease_v2, lcontext)); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 4e28632b5fd6..0c320d06809c 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2392,11 +2392,16 @@ static int add_lease_context(struct TCP_Server_Info *server, struct smb2_create_req *req, struct kvec *iov, - unsigned int *num_iovec, u8 *lease_key, __u8 *oplock) + unsigned int *num_iovec, + u8 *lease_key, + __u8 *oplock, + u8 *parent_lease_key, + __le32 flags) { unsigned int num = *num_iovec; - iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock); + iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock, + parent_lease_key, flags); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = server->vals->create_lease_size; @@ -3069,7 +3074,9 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->RequestedOplockLevel = *oplock; /* no srv lease support */ else { rc = add_lease_context(server, req, iov, &n_iov, - oparms->fid->lease_key, oplock); + oparms->fid->lease_key, oplock, + oparms->fid->parent_lease_key, + oparms->lease_flags); if (rc) return rc; } @@ -4888,7 +4895,7 @@ smb2_writev_callback(struct mid_q_entry *mid) 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); - cifs_write_subrequest_terminated(wdata, result ?: written, true); + cifs_write_subrequest_terminated(wdata, result ?: written); release_mid(mid); trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, server->credits, server->in_flight, @@ -5061,7 +5068,7 @@ out: -(int)wdata->credits.value, cifs_trace_rw_credits_write_response_clear); add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, true); + cifs_write_subrequest_terminated(wdata, rc); } } @@ -5917,71 +5924,6 @@ posix_qfsinf_exit: } int -SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) -{ - struct smb_rqst rqst; - struct smb2_query_info_rsp *rsp = NULL; - struct kvec iov; - struct kvec rsp_iov; - int rc = 0; - int resp_buftype; - struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server; - struct smb2_fs_full_size_info *info = NULL; - int flags = 0; - int retries = 0, cur_sleep = 1; - -replay_again: - /* reinitialize for possible replay */ - flags = 0; - server = cifs_pick_channel(ses); - - rc = build_qfs_info_req(&iov, tcon, server, - FS_FULL_SIZE_INFORMATION, - sizeof(struct smb2_fs_full_size_info), - persistent_fid, volatile_fid); - if (rc) - return rc; - - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = &iov; - rqst.rq_nvec = 1; - - if (retries) - smb2_set_replay(server, &rqst); - - rc = cifs_send_recv(xid, ses, server, - &rqst, &resp_buftype, flags, &rsp_iov); - free_qfs_info_req(&iov); - if (rc) { - cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); - goto qfsinf_exit; - } - rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; - - info = (struct smb2_fs_full_size_info *)( - le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); - rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, - sizeof(struct smb2_fs_full_size_info)); - if (!rc) - smb2_copy_fs_info_to_kstatfs(info, fsdata); - -qfsinf_exit: - free_rsp_buf(resp_buftype, rsp_iov.iov_base); - - if (is_replayable_error(rc) && - smb2_should_replay(tcon, &retries, &cur_sleep)) - goto replay_again; - - return rc; -} - -int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int level) { diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 4662c7e2d259..035aa1624053 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -259,9 +259,6 @@ extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 volatile_fid); extern int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server); void smb2_cancelled_close_fid(struct work_struct *work); -extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_file_id, u64 volatile_file_id, - struct kstatfs *FSData); extern int SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct kstatfs *FSData); diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index b1091e70434a..a9602aae21ef 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -149,6 +149,27 @@ config SQUASHFS_XATTR If unsure, say N. +config SQUASHFS_COMP_CACHE_FULL + bool "Enable full caching of compressed blocks" + depends on SQUASHFS + default n + help + This option enables caching of all compressed blocks, Without caching, + repeated reads of the same files trigger excessive disk I/O, significantly + reducinng performance in workloads like fio-based benchmarks. + + For example, fio tests (iodepth=1, numjobs=1, ioengine=psync) show: + With caching: IOPS=2223, BW=278MiB/s (291MB/s) + Without caching: IOPS=815, BW=102MiB/s (107MB/s) + + Enabling this option restores performance to pre-regression levels by + caching all compressed blocks in the page cache, reducing disk I/O for + repeated reads. However, this increases memory usage, which may be a + concern in memory-constrained environments. + + Enable this option if your workload involves frequent repeated reads and + memory usage is not a limiting factor. If unsure, say N. + config SQUASHFS_ZLIB bool "Include support for ZLIB compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2dc730800f44..3061043e915c 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -88,6 +88,10 @@ static int squashfs_bio_read_cached(struct bio *fullbio, struct bio_vec *bv; int idx = 0; int err = 0; +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + struct page **cache_pages = kmalloc_array(page_count, + sizeof(void *), GFP_KERNEL | __GFP_ZERO); +#endif bio_for_each_segment_all(bv, fullbio, iter_all) { struct page *page = bv->bv_page; @@ -110,6 +114,11 @@ static int squashfs_bio_read_cached(struct bio *fullbio, head_to_cache = page; else if (idx == page_count - 1 && index + length != read_end) tail_to_cache = page; +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + /* Cache all pages in the BIO for repeated reads */ + else if (cache_pages) + cache_pages[idx] = page; +#endif if (!bio || idx != end_idx) { struct bio *new = bio_alloc_clone(bdev, fullbio, @@ -163,6 +172,25 @@ static int squashfs_bio_read_cached(struct bio *fullbio, } } +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + if (!cache_pages) + goto out; + + for (idx = 0; idx < page_count; idx++) { + if (!cache_pages[idx]) + continue; + int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping, + (read_start >> PAGE_SHIFT) + idx, + GFP_NOIO); + + if (!ret) { + SetPageUptodate(cache_pages[idx]); + unlock_page(cache_pages[idx]); + } + } + kfree(cache_pages); +out: +#endif return 0; } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 67c55fe32ce8..992ea0e37257 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -202,6 +202,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); + if (!msblk->devblksize) { + errorf(fc, "squashfs: unable to set blocksize\n"); + return -EINVAL; + } + msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); diff --git a/fs/super.c b/fs/super.c index bcc4e87123c8..21799e213fd7 100644 --- a/fs/super.c +++ b/fs/super.c @@ -824,13 +824,6 @@ struct super_block *sget(struct file_system_type *type, struct super_block *old; int err; - /* We don't yet pass the user namespace of the parent - * mount through to here so always use &init_user_ns - * until that changes. - */ - if (flags & SB_SUBMOUNT) - user_ns = &init_user_ns; - retry: spin_lock(&sb_lock); if (test) { @@ -850,7 +843,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); + s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 762699c1bcf6..eea718ac66b4 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -83,11 +83,11 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/init.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/log2.h> -#include <linux/mount.h> #include <linux/seq_file.h> #include <linux/iversion.h> @@ -289,7 +289,7 @@ void ufs_error (struct super_block * sb, const char * function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { + switch (UFS_SB(sb)->s_on_err) { case UFS_MOUNT_ONERROR_PANIC: panic("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); @@ -342,124 +342,74 @@ void ufs_warning (struct super_block * sb, const char * function, va_end(args); } -enum { - Opt_type_old = UFS_MOUNT_UFSTYPE_OLD, - Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86, - Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN, - Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS, - Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD, - Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2, - Opt_type_hp = UFS_MOUNT_UFSTYPE_HP, - Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD, - Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP, - Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP, - Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC, - Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK, - Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT, - Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR, - Opt_err +enum { Opt_type, Opt_onerror }; + +static const struct constant_table ufs_param_ufstype[] = { + {"old", UFS_MOUNT_UFSTYPE_OLD}, + {"sunx86", UFS_MOUNT_UFSTYPE_SUNx86}, + {"sun", UFS_MOUNT_UFSTYPE_SUN}, + {"sunos", UFS_MOUNT_UFSTYPE_SUNOS}, + {"44bsd", UFS_MOUNT_UFSTYPE_44BSD}, + {"ufs2", UFS_MOUNT_UFSTYPE_UFS2}, + {"5xbsd", UFS_MOUNT_UFSTYPE_UFS2}, + {"hp", UFS_MOUNT_UFSTYPE_HP}, + {"nextstep-cd", UFS_MOUNT_UFSTYPE_NEXTSTEP_CD}, + {"nextstep", UFS_MOUNT_UFSTYPE_NEXTSTEP}, + {"openstep", UFS_MOUNT_UFSTYPE_OPENSTEP}, + {} }; -static const match_table_t tokens = { - {Opt_type_old, "ufstype=old"}, - {Opt_type_sunx86, "ufstype=sunx86"}, - {Opt_type_sun, "ufstype=sun"}, - {Opt_type_sunos, "ufstype=sunos"}, - {Opt_type_44bsd, "ufstype=44bsd"}, - {Opt_type_ufs2, "ufstype=ufs2"}, - {Opt_type_ufs2, "ufstype=5xbsd"}, - {Opt_type_hp, "ufstype=hp"}, - {Opt_type_nextstepcd, "ufstype=nextstep-cd"}, - {Opt_type_nextstep, "ufstype=nextstep"}, - {Opt_type_openstep, "ufstype=openstep"}, -/*end of possible ufs types */ - {Opt_onerror_panic, "onerror=panic"}, - {Opt_onerror_lock, "onerror=lock"}, - {Opt_onerror_umount, "onerror=umount"}, - {Opt_onerror_repair, "onerror=repair"}, - {Opt_err, NULL} +static const struct constant_table ufs_param_onerror[] = { + {"panic", UFS_MOUNT_ONERROR_PANIC}, + {"lock", UFS_MOUNT_ONERROR_LOCK}, + {"umount", UFS_MOUNT_ONERROR_UMOUNT}, + {"repair", UFS_MOUNT_ONERROR_REPAIR}, + {} }; -static int ufs_parse_options (char * options, unsigned * mount_options) +static const struct fs_parameter_spec ufs_param_spec[] = { + fsparam_enum ("ufstype", Opt_type, ufs_param_ufstype), + fsparam_enum ("onerror", Opt_onerror, ufs_param_onerror), + {} +}; + +struct ufs_fs_context { + unsigned int flavour, on_err; +}; + +static int ufs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char * p; - + struct ufs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + UFSD("ENTER\n"); - - if (!options) - return 1; - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; + opt = fs_parse(fc, ufs_param_spec, param, &result); + if (opt < 0) + return opt; - token = match_token(p, tokens, args); - switch (token) { - case Opt_type_old: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_OLD); - break; - case Opt_type_sunx86: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_SUNx86); - break; - case Opt_type_sun: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_SUN); - break; - case Opt_type_sunos: - ufs_clear_opt(*mount_options, UFSTYPE); - ufs_set_opt(*mount_options, UFSTYPE_SUNOS); - break; - case Opt_type_44bsd: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_44BSD); - break; - case Opt_type_ufs2: - ufs_clear_opt(*mount_options, UFSTYPE); - ufs_set_opt(*mount_options, UFSTYPE_UFS2); - break; - case Opt_type_hp: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_HP); - break; - case Opt_type_nextstepcd: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP_CD); - break; - case Opt_type_nextstep: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP); - break; - case Opt_type_openstep: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_OPENSTEP); - break; - case Opt_onerror_panic: - ufs_clear_opt (*mount_options, ONERROR); - ufs_set_opt (*mount_options, ONERROR_PANIC); - break; - case Opt_onerror_lock: - ufs_clear_opt (*mount_options, ONERROR); - ufs_set_opt (*mount_options, ONERROR_LOCK); - break; - case Opt_onerror_umount: - ufs_clear_opt (*mount_options, ONERROR); - ufs_set_opt (*mount_options, ONERROR_UMOUNT); - break; - case Opt_onerror_repair: - pr_err("Unable to do repair on error, will lock lock instead\n"); - ufs_clear_opt (*mount_options, ONERROR); - ufs_set_opt (*mount_options, ONERROR_REPAIR); - break; - default: - pr_err("Invalid option: \"%s\" or missing value\n", p); + switch (opt) { + case Opt_type: + if (ctx->flavour == result.uint_32) /* no-op */ return 0; + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + pr_err("ufstype can't be changed during remount\n"); + return -EINVAL; } + if (!ctx->flavour) { + pr_err("conflicting ufstype options\n"); + return -EINVAL; + } + ctx->flavour = result.uint_32; + break; + case Opt_onerror: + ctx->on_err = result.uint_32; + break; + default: + return -EINVAL; } - return 1; + return 0; } /* @@ -474,7 +424,7 @@ static void ufs_setup_cstotal(struct super_block *sb) struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; - unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + unsigned mtype = sbi->s_flavour; UFSD("ENTER, mtype=%u\n", mtype); usb1 = ubh_get_usb_first(uspi); @@ -580,7 +530,7 @@ failed: */ static void ufs_put_cstotal(struct super_block *sb) { - unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + unsigned mtype = UFS_SB(sb)->s_flavour; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; @@ -764,8 +714,10 @@ static u64 ufs_max_bytes(struct super_block *sb) return res << uspi->s_bshift; } -static int ufs_fill_super(struct super_block *sb, void *data, int silent) +static int ufs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct ufs_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct ufs_sb_info * sbi; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; @@ -803,24 +755,18 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); - /* - * Set default mount options - * Parse mount options - */ - sbi->s_mount_opt = 0; - ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); - if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { - pr_err("wrong mount options\n"); - goto failed; - } - if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { + + sbi->s_flavour = ctx->flavour; + sbi->s_on_err = ctx->on_err; + + if (!sbi->s_flavour) { if (!silent) pr_err("You didn't specify the type of your ufs filesystem\n\n" "mount -t ufs -o ufstype=" "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " "default is ufstype=old\n"); - ufs_set_opt (sbi->s_mount_opt, UFSTYPE_OLD); + sbi->s_flavour = UFS_MOUNT_UFSTYPE_OLD; } uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL); @@ -836,7 +782,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_min = S32_MIN; sb->s_time_max = S32_MAX; - switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { + switch (sbi->s_flavour) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); uspi->s_fsize = block_size = 512; @@ -1035,9 +981,9 @@ again: goto magic_found; } - if ((((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP) - || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD) - || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_OPENSTEP)) + if ((sbi->s_flavour == UFS_MOUNT_UFSTYPE_NEXTSTEP + || sbi->s_flavour == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD + || sbi->s_flavour == UFS_MOUNT_UFSTYPE_OPENSTEP) && uspi->s_sbbase < 256) { ubh_brelse_uspi(uspi); ubh = NULL; @@ -1237,8 +1183,8 @@ magic_found: uspi->s_bpf = uspi->s_fsize << 3; uspi->s_bpfshift = uspi->s_fshift + 3; uspi->s_bpfmask = uspi->s_bpf - 1; - if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD || - (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_UFS2) + if (sbi->s_flavour == UFS_MOUNT_UFSTYPE_44BSD || + sbi->s_flavour == UFS_MOUNT_UFSTYPE_UFS2) uspi->s_maxsymlinklen = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); @@ -1290,13 +1236,15 @@ failed_nomem: return -ENOMEM; } -static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) +static int ufs_reconfigure(struct fs_context *fc) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; - unsigned new_mount_opt, ufstype; - unsigned flags; + struct ufs_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + unsigned int ufstype; + unsigned int flags; sync_filesystem(sb); mutex_lock(&UFS_SB(sb)->s_lock); @@ -1305,27 +1253,10 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); - /* - * Allow the "check" option to be passed as a remount option. - * It is not possible to change ufstype option during remount - */ - ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; - new_mount_opt = 0; - ufs_set_opt (new_mount_opt, ONERROR_LOCK); - if (!ufs_parse_options (data, &new_mount_opt)) { - mutex_unlock(&UFS_SB(sb)->s_lock); - return -EINVAL; - } - if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { - new_mount_opt |= ufstype; - } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { - pr_err("ufstype can't be changed during remount\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); - return -EINVAL; - } + ufstype = UFS_SB(sb)->s_flavour; - if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) { - UFS_SB(sb)->s_mount_opt = new_mount_opt; + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) { + UFS_SB(sb)->s_on_err = ctx->on_err; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1333,7 +1264,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) /* * fs was mouted as rw, remounting ro */ - if (*mount_flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { ufs_put_super_internal(sb); usb1->fs_time = ufs_get_seconds(sb); if ((flags & UFS_ST_MASK) == UFS_ST_SUN @@ -1369,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) sb->s_flags &= ~SB_RDONLY; #endif } - UFS_SB(sb)->s_mount_opt = new_mount_opt; + UFS_SB(sb)->s_on_err = ctx->on_err; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1377,19 +1308,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) static int ufs_show_options(struct seq_file *seq, struct dentry *root) { struct ufs_sb_info *sbi = UFS_SB(root->d_sb); - unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; - const struct match_token *tp = tokens; + unsigned mval = sbi->s_flavour; + const struct constant_table *tp; - while (tp->token != Opt_onerror_panic && tp->token != mval) + tp = ufs_param_ufstype; + while (tp->value && tp->value != mval) ++tp; - BUG_ON(tp->token == Opt_onerror_panic); - seq_printf(seq, ",%s", tp->pattern); + seq_printf(seq, ",ufstype=%s", tp->name); - mval = sbi->s_mount_opt & UFS_MOUNT_ONERROR; - while (tp->token != Opt_err && tp->token != mval) + tp = ufs_param_onerror; + mval = sbi->s_on_err; + while (tp->value && tp->value != mval) ++tp; - BUG_ON(tp->token == Opt_err); - seq_printf(seq, ",%s", tp->pattern); + seq_printf(seq, ",onerror=%s", tp->name); return 0; } @@ -1483,21 +1414,57 @@ static const struct super_operations ufs_super_ops = { .put_super = ufs_put_super, .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, - .remount_fs = ufs_remount, .show_options = ufs_show_options, }; -static struct dentry *ufs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ufs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ufs_fill_super); +} + +static void ufs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations ufs_context_ops = { + .parse_param = ufs_parse_param, + .get_tree = ufs_get_tree, + .reconfigure = ufs_reconfigure, + .free = ufs_free_fc, +}; + +static int ufs_init_fs_context(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super); + struct ufs_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct ufs_sb_info *sbi = UFS_SB(sb); + + ctx->flavour = sbi->s_flavour; + ctx->on_err = sbi->s_on_err; + } else { + ctx->flavour = 0; + ctx->on_err = UFS_MOUNT_ONERROR_LOCK; + } + + fc->fs_private = ctx; + fc->ops = &ufs_context_ops; + + return 0; } static struct file_system_type ufs_fs_type = { .owner = THIS_MODULE, .name = "ufs", - .mount = ufs_mount, .kill_sb = kill_block_super, + .init_fs_context = ufs_init_fs_context, + .parameters = ufs_param_spec, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ufs"); diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index e7df65dd4351..788e025056b2 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,7 +24,8 @@ struct ufs_sb_info { struct ufs_cg_private_info * s_ucpi[UFS_MAX_GROUP_LOADED]; unsigned s_cgno[UFS_MAX_GROUP_LOADED]; unsigned short s_cg_loaded; - unsigned s_mount_opt; + unsigned s_flavour; + unsigned s_on_err; struct super_block *sb; int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ @@ -52,13 +53,11 @@ struct ufs_inode_info { }; /* mount options */ -#define UFS_MOUNT_ONERROR 0x0000000F #define UFS_MOUNT_ONERROR_PANIC 0x00000001 #define UFS_MOUNT_ONERROR_LOCK 0x00000002 #define UFS_MOUNT_ONERROR_UMOUNT 0x00000004 #define UFS_MOUNT_ONERROR_REPAIR 0x00000008 -#define UFS_MOUNT_UFSTYPE 0x0000FFF0 #define UFS_MOUNT_UFSTYPE_OLD 0x00000010 #define UFS_MOUNT_UFSTYPE_44BSD 0x00000020 #define UFS_MOUNT_UFSTYPE_SUN 0x00000040 @@ -70,10 +69,6 @@ struct ufs_inode_info { #define UFS_MOUNT_UFSTYPE_UFS2 0x00001000 #define UFS_MOUNT_UFSTYPE_SUNOS 0x00002000 -#define ufs_clear_opt(o,opt) o &= ~UFS_MOUNT_##opt -#define ufs_set_opt(o,opt) o |= UFS_MOUNT_##opt -#define ufs_test_opt(o,opt) ((o) & UFS_MOUNT_##opt) - /* * Debug code */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 26a04a783489..63151feb9c3f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -436,6 +436,25 @@ allocate_blocks: return 0; } +static bool +xfs_ioend_needs_wq_completion( + struct iomap_ioend *ioend) +{ + /* Changing inode size requires a transaction. */ + if (xfs_ioend_is_append(ioend)) + return true; + + /* Extent manipulation requires a transaction. */ + if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)) + return true; + + /* Page cache invalidation cannot be done in irq context. */ + if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) + return true; + + return false; +} + static int xfs_submit_ioend( struct iomap_writepage_ctx *wpc, @@ -460,8 +479,7 @@ xfs_submit_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || - (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) + if (xfs_ioend_needs_wq_completion(ioend)) ioend->io_bio.bi_end_io = xfs_end_bio; if (status) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index d613a4094db6..9c00fc5baa30 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -290,8 +290,6 @@ xfs_zone_gc_query_cb( return 0; } -#define cmp_int(l, r) ((l > r) - (l < r)) - static int xfs_zone_gc_rmap_rec_cmp( const void *a, |