Diffstat (limited to 'fs')
 90 files changed, 1420 insertions(+), 1072 deletions(-)
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f3248a3e5402..c1acbc98465d 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -66,7 +66,6 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	struct p9_fid *fid;
 	struct inode *inode;
 	struct v9fs_inode *v9inode;
-	unsigned int cached;
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
@@ -76,11 +75,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 		goto out_valid;
 
 	v9inode = V9FS_I(inode);
-	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-
-	cached = v9ses->cache & (CACHE_META | CACHE_LOOSE);
-
-	if (!cached || v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+	if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
 		int retval;
 		struct v9fs_session_info *v9ses;
@@ -114,6 +109,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 			p9_debug(P9_DEBUG_VFS,
 				"refresh inode: dentry = %pd (%p), got error %pe\n",
 				dentry, dentry, ERR_PTR(retval));
+		if (retval < 0)
 			return retval;
 		}
 	}
@@ -150,8 +146,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
 };
 
 const struct dentry_operations v9fs_dentry_operations = {
-	.d_revalidate = v9fs_lookup_revalidate,
-	.d_weak_revalidate = __v9fs_lookup_revalidate,
 	.d_release = v9fs_dentry_release,
 	.d_unalias_trylock = v9fs_dentry_unalias_trylock,
 	.d_unalias_unlock = v9fs_dentry_unalias_unlock,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 69f378a83775..d0c77ec31b1d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1339,14 +1339,8 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	 * Don't update inode if the file type is different
 	 */
 	umode = p9mode2unixmode(v9ses, st, &rdev);
-	if (inode_wrong_type(inode, umode)) {
-		/*
-		 * Do this as a way of letting the caller know the inode should not
-		 * be reused
-		 */
-		v9fs_invalidate_inode_attr(inode);
+	if (inode_wrong_type(inode, umode))
 		goto out;
-	}
 
 	/*
 	 * We don't want to refresh inode->i_size,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 0b404e8484d2..be297e335468 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -897,14 +897,8 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	/*
 	 * Don't update inode if the file type is different
 	 */
-	if (inode_wrong_type(inode, st->st_mode)) {
-		/*
-		 * Do this as a way of letting the caller know the inode should not
-		 * be reused
-		 */
-		v9fs_invalidate_inode_attr(inode);
+	if (inode_wrong_type(inode, st->st_mode))
 		goto out;
-	}
 
 	/*
 	 * We don't want to refresh inode->i_size,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 41e37f7f67cc..3df7b9d7fbe8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -2110,9 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 
 		for (int i = 0; i < count; i++) {
 			__btrfs_kill_delayed_node(delayed_nodes[i]);
+			btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
 			btrfs_release_delayed_node(delayed_nodes[i],
 						   &delayed_node_trackers[i]);
-			btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
 		}
 	}
 }
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 0d949edc0caf..b09d4ec8c77d 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -219,6 +219,13 @@ static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed
 	if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
 		return;
 
+	/*
+	 * Only print if there are leaked references. The caller is
+	 * holding one reference, so if refs == 1 there is no leak.
+	 */
+	if (refcount_read(&node->refs) == 1)
+		return;
+
 	ref_tracker_dir_print(&node->ref_dir.dir,
 			      BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT);
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c123a3ef154a..755ec6dfd51c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -973,7 +973,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
 {
 	const u64 ra_pos = readahead_pos(ractl);
 	const u64 ra_end = ra_pos + readahead_length(ractl);
-	const u64 em_end = em->start + em->ram_bytes;
+	const u64 em_end = em->start + em->len;
 
 	/* No expansion for holes and inline extents. */
 	if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index dad0b492a663..d86541073d42 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1106,14 +1106,15 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
 	 * If ret is 1 (no key found), it means this is an empty block group,
 	 * without any extents allocated from it and there's no block group
 	 * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
-	 * because we are using the block group tree feature, so block group
-	 * items are stored in the block group tree. It also means there are no
-	 * extents allocated for block groups with a start offset beyond this
-	 * block group's end offset (this is the last, highest, block group).
+	 * because we are using the block group tree feature (so block group
+	 * items are stored in the block group tree) or this is a new block
+	 * group created in the current transaction and its block group item
+	 * was not yet inserted in the extent tree (that happens in
+	 * btrfs_create_pending_block_groups() -> insert_block_group_item()).
+	 * It also means there are no extents allocated for block groups with a
+	 * start offset beyond this block group's end offset (this is the last,
+	 * highest, block group).
 	 */
-	if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
-		ASSERT(ret == 0);
-
 	start = block_group->start;
 	end = block_group->start + block_group->length;
 	while (ret == 0) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 185bef0df1c2..8cb7d5a462ef 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3740,7 +3740,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 		prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
 		if (!prealloc) {
 			ret = -ENOMEM;
-			goto drop_write;
+			goto out;
 		}
 	}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index de4cb0f3fbd0..e9224145d754 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -982,7 +982,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 	extent_root = btrfs_extent_root(fs_info, 0);
 	/* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */
-	if (IS_ERR(extent_root)) {
+	if (!extent_root) {
 		btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
 		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
 		return 0;
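The delayed-inode.h hunk above reports tracked references only when something besides the caller's own reference remains. A minimal standalone sketch of that guard; struct tracked_obj and report_leaks() are illustrative stand-ins, not the btrfs ref_tracker API:

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-in for a refcounted kernel object. */
struct tracked_obj {
	int refs;		/* reference count; the caller holds one */
	const char *name;
};

/* Mirror the "refs == 1 means no leak" early return from the hunk above. */
static void report_leaks(const struct tracked_obj *obj)
{
	assert(obj->refs >= 1);	/* caller must hold a reference */
	if (obj->refs == 1)
		return;		/* only the caller's ref: nothing leaked */
	fprintf(stderr, "%s: %d leaked reference(s)\n",
		obj->name, obj->refs - 1);
}

int main(void)
{
	struct tracked_obj ok = { .refs = 1, .name = "ok" };
	struct tracked_obj leaky = { .refs = 3, .name = "leaky" };

	report_leaks(&ok);	/* silent */
	report_leaks(&leaky);	/* reports 2 leaked references */
	return 0;
}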
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8dd8de6b9fb8..0765e06d00b8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3780,6 +3780,7 @@ out:
 /*
  * Mark start of chunk relocation that is cancellable. Check if the cancellation
  * has been requested meanwhile and don't start in that case.
+ * NOTE: if this returns an error, reloc_chunk_end() must not be called.
  *
  * Return:
  *   0             success
@@ -3796,10 +3797,8 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
 
 	if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
 		btrfs_info(fs_info, "chunk relocation canceled on start");
-		/*
-		 * On cancel, clear all requests but let the caller mark
-		 * the end after cleanup operations.
-		 */
+		/* On cancel, clear all requests. */
+		clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
 		atomic_set(&fs_info->reloc_cancel_req, 0);
 		return -ECANCELED;
 	}
@@ -3808,9 +3807,11 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
 
 /*
  * Mark end of chunk relocation that is cancellable and wake any waiters.
+ * NOTE: call only if a previous call to reloc_chunk_start() succeeded.
  */
 static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
 {
+	ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags));
 	/* Requested after start, clear bit first so any waiters can continue */
 	if (atomic_read(&fs_info->reloc_cancel_req) > 0)
 		btrfs_info(fs_info, "chunk relocation canceled during operation");
@@ -4023,9 +4024,9 @@ out:
 	if (err && rw)
 		btrfs_dec_block_group_ro(rc->block_group);
 	iput(rc->data_inode);
+	reloc_chunk_end(fs_info);
 out_put_bg:
 	btrfs_put_block_group(bg);
-	reloc_chunk_end(fs_info);
 	free_reloc_control(rc);
 	return err;
 }
@@ -4208,8 +4209,8 @@ out_clean:
 		ret = ret2;
 out_unset:
 	unset_reloc_control(rc);
-out_end:
 	reloc_chunk_end(fs_info);
+out_end:
 	free_reloc_control(rc);
 out:
 	free_reloc_roots(&reloc_roots);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4691d0bdb2e8..651b11884f82 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -694,7 +694,7 @@ static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
 
 	/* stripe->folios[] is allocated by us and no highmem is allowed. */
 	ASSERT(folio);
-	ASSERT(!folio_test_partial_kmap(folio));
+	ASSERT(!folio_test_highmem(folio));
 	return folio_address(folio) + offset_in_folio(folio, offset);
 }
@@ -707,7 +707,7 @@ static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int secto
 
 	/* stripe->folios[] is allocated by us and no highmem is allowed. */
 	ASSERT(folio);
-	ASSERT(!folio_test_partial_kmap(folio));
+	ASSERT(!folio_test_highmem(folio));
 	/* And the range must be contained inside the folio. */
 	ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
 	return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
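The relocation.c hunks above tighten the pairing rule: reloc_chunk_end() may only run after a successful reloc_chunk_start(), so the cancel path in start now clears the running bit itself. A standalone sketch of that pairing contract; chunk_start()/chunk_end() and the running flag are illustrative, not the btrfs API:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool running;		/* stands in for BTRFS_FS_RELOC_RUNNING */
static int cancel_requested;

/* On failure the flag is already cleared: the caller must not call chunk_end(). */
static int chunk_start(void)
{
	running = true;
	if (cancel_requested > 0) {
		running = false;	/* clean up our own marker on the error path */
		cancel_requested = 0;
		return -1;		/* canceled on start */
	}
	return 0;
}

/* Call only after a successful chunk_start(). */
static void chunk_end(void)
{
	assert(running);		/* catches unbalanced end calls */
	running = false;
}

int main(void)
{
	if (chunk_start() == 0)
		chunk_end();		/* balanced pair on the success path */

	cancel_requested = 1;
	if (chunk_start() != 0)
		printf("canceled: skip chunk_end()\n");
	return 0;
}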
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9230e5066fc6..96a030d28e09 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -178,7 +178,6 @@ struct send_ctx {
 	u64 cur_inode_rdev;
 	u64 cur_inode_last_extent;
 	u64 cur_inode_next_write_offset;
-	struct fs_path cur_inode_path;
 	bool cur_inode_new;
 	bool cur_inode_new_gen;
 	bool cur_inode_deleted;
@@ -305,6 +304,9 @@ struct send_ctx {
 	struct btrfs_lru_cache dir_created_cache;
 	struct btrfs_lru_cache dir_utimes_cache;
+
+	/* Must be last as it ends in a flexible-array member. */
+	struct fs_path cur_inode_path;
 };
 
 struct pending_dir_move {
@@ -4100,6 +4102,48 @@ out:
 	return ret;
 }
 
+static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node)
+{
+	const struct recorded_ref *data = k;
+	const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+
+	if (data->dir > ref->dir)
+		return 1;
+	if (data->dir < ref->dir)
+		return -1;
+	if (data->dir_gen > ref->dir_gen)
+		return 1;
+	if (data->dir_gen < ref->dir_gen)
+		return -1;
+	return 0;
+}
+
+static bool rbtree_check_dir_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+	const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+	return rbtree_check_dir_ref_comp(entry, parent) < 0;
+}
+
+static int record_check_dir_ref_in_tree(struct rb_root *root,
+			struct recorded_ref *ref, struct list_head *list)
+{
+	struct recorded_ref *tmp_ref;
+	int ret;
+
+	if (rb_find(ref, root, rbtree_check_dir_ref_comp))
+		return 0;
+
+	ret = dup_ref(ref, list);
+	if (ret < 0)
+		return ret;
+
+	tmp_ref = list_last_entry(list, struct recorded_ref, list);
+	rb_add(&tmp_ref->node, root, rbtree_check_dir_ref_less);
+	tmp_ref->root = root;
+	return 0;
+}
+
 static int rename_current_inode(struct send_ctx *sctx,
 				struct fs_path *current_path,
 				struct fs_path *new_path)
@@ -4127,11 +4171,11 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 	struct recorded_ref *cur;
 	struct recorded_ref *cur2;
 	LIST_HEAD(check_dirs);
+	struct rb_root rbtree_check_dirs = RB_ROOT;
 	struct fs_path *valid_path = NULL;
 	u64 ow_inode = 0;
 	u64 ow_gen;
 	u64 ow_mode;
-	u64 last_dir_ino_rm = 0;
 	bool did_overwrite = false;
 	bool is_orphan = false;
 	bool can_rename = true;
@@ -4435,7 +4479,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 					goto out;
 			}
 		}
-		ret = dup_ref(cur, &check_dirs);
+		ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
 		if (ret < 0)
 			goto out;
 	}
@@ -4463,7 +4507,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 		}
 
 		list_for_each_entry(cur, &sctx->deleted_refs, list) {
-			ret = dup_ref(cur, &check_dirs);
+			ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
 			if (ret < 0)
 				goto out;
 		}
@@ -4473,7 +4517,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 		 * We have a moved dir. Add the old parent to check_dirs
 		 */
 		cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list);
-		ret = dup_ref(cur, &check_dirs);
+		ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
 		if (ret < 0)
 			goto out;
 	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
@@ -4507,7 +4551,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 				if (is_current_inode_path(sctx, cur->full_path))
 					fs_path_reset(&sctx->cur_inode_path);
 			}
-			ret = dup_ref(cur, &check_dirs);
+			ret = record_check_dir_ref_in_tree(&rbtree_check_dirs, cur, &check_dirs);
 			if (ret < 0)
 				goto out;
 		}
@@ -4550,8 +4594,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 			ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
 			if (ret < 0)
 				goto out;
-		} else if (ret == inode_state_did_delete &&
-			   cur->dir != last_dir_ino_rm) {
+		} else if (ret == inode_state_did_delete) {
 			ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
 			if (ret < 0)
 				goto out;
@@ -4563,7 +4606,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
 				ret = send_rmdir(sctx, valid_path);
 				if (ret < 0)
 					goto out;
-				last_dir_ino_rm = cur->dir;
 			}
 		}
 	}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d6e496436539..430e7419349c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1900,8 +1900,6 @@ static int btrfs_get_tree_super(struct fs_context *fc)
 		return PTR_ERR(sb);
 	}
 
-	set_device_specific_options(fs_info);
-
 	if (sb->s_root) {
 		/*
 		 * Not the first mount of the fs thus got an existing super block.
@@ -1946,6 +1944,7 @@ static int btrfs_get_tree_super(struct fs_context *fc)
 			deactivate_locked_super(sb);
 			return -EACCES;
 		}
+		set_device_specific_options(fs_info);
 		bdev = fs_devices->latest_dev->bdev;
 		snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
 		shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
@@ -2069,7 +2068,13 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
 	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
 	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
 	if (!fs_info->super_copy || !fs_info->super_for_commit) {
-		btrfs_free_fs_info(fs_info);
+		/*
+		 * Don't call btrfs_free_fs_info() to free it as it's still
+		 * initialized partially.
+		 */
+		kfree(fs_info->super_copy);
+		kfree(fs_info->super_for_commit);
+		kvfree(fs_info);
 		return -ENOMEM;
 	}
 	btrfs_init_fs_info(fs_info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ca30b15ea452..c10b4c242acf 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1797,7 +1797,7 @@ static int check_inode_extref(struct extent_buffer *leaf,
 		struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr;
 		u16 namelen;
 
-		if (unlikely(ptr + sizeof(*extref)) > end) {
+		if (unlikely(ptr + sizeof(*extref) > end)) {
 			inode_ref_err(leaf, slot,
 			"inode extref overflow, ptr %lu end %lu inode_extref size %zu",
 				      ptr, end, sizeof(*extref));
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index e00036672f33..0ea0df18a8e4 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1753,7 +1753,7 @@ out:
 	    !fs_info->stripe_root) {
 		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
 			  btrfs_bg_type_to_raid_name(map->type));
-		return -EINVAL;
+		ret = -EINVAL;
 	}
 
 	if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
diff --git a/fs/coredump.c b/fs/coredump.c
index b5fc06a092a4..5c1c381ee380 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1468,7 +1468,7 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
 	ssize_t retval;
 	char old_core_pattern[CORENAME_MAX_SIZE];
 
-	if (write)
+	if (!write)
 		return proc_dostring(table, write, buffer, lenp, ppos);
 
 	retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1725,7 +1725,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iov_iter_rw(iter) == WRITE) {
 		lockdep_assert_held_write(&iomi.inode->i_rwsem);
 		iomi.flags |= IOMAP_WRITE;
-	} else {
+	} else if (!sb_rdonly(iomi.inode->i_sb)) {
 		lockdep_assert_held(&iomi.inode->i_rwsem);
 	}
diff --git a/fs/dcache.c b/fs/dcache.c
index a067fa0a965a..035cccbc9276 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2557,6 +2557,8 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
 	spin_lock(&parent->d_lock);
 	new->d_parent = dget_dlock(parent);
 	hlist_add_head(&new->d_sib, &parent->d_children);
+	if (parent->d_flags & DCACHE_DISCONNECTED)
+		new->d_flags |= DCACHE_DISCONNECTED;
 	spin_unlock(&parent->d_lock);
 
retry:
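The tree-checker.c one-liner above is a parenthesis-placement fix: in `unlikely(ptr + sizeof(*extref)) > end` the annotation wrapped only the sum, so the comparison was made against the macro's normalized 0/1 result and the bounds check was effectively dead. A standalone illustration of the pitfall; the `unlikely()` definition below is a local stand-in for the kernel macro:

#include <stdio.h>

/* Local stand-in for the kernel's branch-prediction hint. */
#define unlikely(x) __builtin_expect(!!(x), 0)

int main(void)
{
	unsigned long ptr = 100, end = 90;
	unsigned long size = 8;

	/* Buggy grouping: unlikely() normalizes the sum to 0/1, then that
	 * 0/1 is compared against end, so the overflow is never caught. */
	if (unlikely(ptr + size) > end)
		printf("buggy form: never reached for end >= 1\n");
	else
		printf("buggy form: bounds overflow missed (end=%lu)\n", end);

	/* Fixed grouping: the whole comparison sits inside the hint. */
	if (unlikely(ptr + size > end))
		printf("fixed form: overflow detected\n");
	return 0;
}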
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index e5581dbeb4c2..c8d8e129eb4b 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -55,10 +55,6 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
 	} else {
 		m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
 		m->clusterofs = le16_to_cpu(di->di_clusterofs);
-		if (m->clusterofs >= 1 << vi->z_lclusterbits) {
-			DBG_BUGON(1);
-			return -EFSCORRUPTED;
-		}
 		m->pblk = le32_to_cpu(di->di_u.blkaddr);
 	}
 	return 0;
@@ -240,21 +236,29 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
 static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
 					   unsigned int lcn, bool lookahead)
 {
+	struct erofs_inode *vi = EROFS_I(m->inode);
+	int err;
+
+	if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT) {
+		err = z_erofs_load_compact_lcluster(m, lcn, lookahead);
+	} else {
+		DBG_BUGON(vi->datalayout != EROFS_INODE_COMPRESSED_FULL);
+		err = z_erofs_load_full_lcluster(m, lcn);
+	}
+	if (err)
+		return err;
+
 	if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
 		erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu",
-				m->type, lcn, EROFS_I(m->inode)->nid);
+			  m->type, lcn, EROFS_I(m->inode)->nid);
 		DBG_BUGON(1);
 		return -EOPNOTSUPP;
+	} else if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD &&
+		   m->clusterofs >= (1 << vi->z_lclusterbits)) {
+		DBG_BUGON(1);
+		return -EFSCORRUPTED;
 	}
-
-	switch (EROFS_I(m->inode)->datalayout) {
-	case EROFS_INODE_COMPRESSED_FULL:
-		return z_erofs_load_full_lcluster(m, lcn);
-	case EROFS_INODE_COMPRESSED_COMPACT:
-		return z_erofs_load_compact_lcluster(m, lcn, lookahead);
-	default:
-		return -EINVAL;
-	}
+	return 0;
 }
 
@@ -268,20 +272,19 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
 		unsigned long lcn = m->lcn - lookback_distance;
 		int err;
 
+		if (!lookback_distance)
+			break;
+
 		err = z_erofs_load_lcluster_from_disk(m, lcn, false);
 		if (err)
 			return err;
-
 		if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
 			lookback_distance = m->delta[0];
-			if (!lookback_distance)
-				break;
 			continue;
-		} else {
-			m->headtype = m->type;
-			m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
-			return 0;
 		}
+		m->headtype = m->type;
+		m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+		return 0;
 	}
 	erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
 		  lookback_distance, m->lcn, vi->nid);
@@ -431,13 +434,6 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
 			end = inode->i_size;
 	} else {
 		if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
-			/* m.lcn should be >= 1 if endoff < m.clusterofs */
-			if (!m.lcn) {
-				erofs_err(sb, "invalid logical cluster 0 at nid %llu",
-					  vi->nid);
-				err = -EFSCORRUPTED;
-				goto unmap_out;
-			}
 			end = (m.lcn << lclusterbits) | m.clusterofs;
 			map->m_flags |= EROFS_MAP_FULL_MAPPED;
 			m.delta[0] = 1;
@@ -596,7 +592,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
 			vi->z_fragmentoff = map->m_plen;
 			if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
 				vi->z_fragmentoff |= map->m_pa << 32;
-		} else if (map->m_plen) {
+		} else if (map->m_plen & Z_EROFS_EXTENT_PLEN_MASK) {
 			map->m_flags |= EROFS_MAP_MAPPED |
 				EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED;
 			fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;
@@ -715,6 +711,7 @@ static int z_erofs_map_sanity_check(struct inode *inode,
 				    struct erofs_map_blocks *map)
 {
 	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	u64 pend;
 
 	if (!(map->m_flags & EROFS_MAP_ENCODED))
 		return 0;
@@ -732,6 +729,10 @@ static int z_erofs_map_sanity_check(struct inode *inode,
 	if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
 		     map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
 		return -EOPNOTSUPP;
+	/* Filesystems beyond 48-bit physical block addresses are invalid */
+	if (unlikely(check_add_overflow(map->m_pa, map->m_plen, &pend) ||
+		     (pend >> sbi->blkszbits) >= BIT_ULL(48)))
+		return -EFSCORRUPTED;
 	return 0;
 }
diff --git a/fs/exec.c b/fs/exec.c
index 6b70c6726d31..4298e7e08d5d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
 {
 	int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
-	if (!error && !write)
+	if (!error && write)
 		validate_coredump_safety();
 	return error;
 }
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 329697c89d09..38210fb6901c 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -29,7 +29,6 @@ enum exfat_error_mode {
 enum {
 	NLS_NAME_NO_LOSSY =	0,	/* no lossy */
 	NLS_NAME_LOSSY =	1 << 0,	/* just detected incorrect filename(s) */
-	NLS_NAME_OVERLEN =	1 << 1,	/* the length is over than its limit */
 };
 
 #define EXFAT_HASH_BITS		8
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index f246cf439588..adc37b4d7fc2 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -509,8 +509,8 @@ static int exfat_ioctl_get_volume_label(struct super_block *sb, unsigned long ar
 static int exfat_ioctl_set_volume_label(struct super_block *sb,
 					unsigned long arg)
 {
-	int ret = 0, lossy;
-	char label[FSLABEL_MAX];
+	int ret = 0, lossy, label_len;
+	char label[FSLABEL_MAX] = {0};
 	struct exfat_uni_name uniname;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -520,8 +520,9 @@ static int exfat_ioctl_set_volume_label(struct super_block *sb,
 		return -EFAULT;
 
 	memset(&uniname, 0, sizeof(uniname));
+	label_len = strnlen(label, FSLABEL_MAX - 1);
 	if (label[0]) {
-		ret = exfat_nls_to_utf16(sb, label, FSLABEL_MAX,
+		ret = exfat_nls_to_utf16(sb, label, label_len,
 					 &uniname, &lossy);
 		if (ret < 0)
 			return ret;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 7eb9c67fd35f..745dce29ddb5 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -442,7 +442,7 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path,
 		return namelen; /* return error value */
 
 	if ((lossy && !lookup) || !namelen)
-		return (lossy & NLS_NAME_OVERLEN) ? -ENAMETOOLONG : -EINVAL;
+		return -EINVAL;
 
 	return 0;
 }
@@ -642,10 +642,14 @@ static int exfat_find(struct inode *dir, const struct qstr *qname,
 	info->type = exfat_get_entry_type(ep);
 	info->attr = le16_to_cpu(ep->dentry.file.attr);
-	info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
 	info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size);
 	info->size = le64_to_cpu(ep2->dentry.stream.size);
 
+	if (info->valid_size < 0) {
+		exfat_fs_error(sb, "data valid size is invalid(%lld)", info->valid_size);
+		return -EIO;
+	}
+
 	if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) {
 		exfat_fs_error(sb, "data size is invalid(%lld)", info->size);
 		return -EIO;
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 8243d94ceaf4..57db08a5271c 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -616,9 +616,6 @@ static int exfat_nls_to_ucs2(struct super_block *sb,
 		unilen++;
 	}
 
-	if (p_cstring[i] != '\0')
-		lossy |= NLS_NAME_OVERLEN;
-
 	*uniname = '\0';
 	p_uniname->name_len = unilen;
 	p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0,
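The exfat/file.c change above stops passing the fixed FSLABEL_MAX as the input length: the buffer is zero-initialized and measured with strnlen() first, so an unterminated label from userspace can neither be overread nor feed a bogus length to the converter. A standalone sketch of that defensive pattern; handle_label() is illustrative:

#include <stdio.h>
#include <string.h>

#define FSLABEL_MAX 256

/* Copy an untrusted, possibly unterminated buffer, then measure it. */
static int handle_label(const char *user_buf, size_t user_len)
{
	char label[FSLABEL_MAX] = {0};		/* zero fill guarantees termination */
	size_t label_len;

	if (user_len > sizeof(label) - 1)
		user_len = sizeof(label) - 1;	/* leave room for the final NUL */
	memcpy(label, user_buf, user_len);

	label_len = strnlen(label, FSLABEL_MAX - 1);
	printf("using %zu byte(s) of label '%s'\n", label_len, label);
	return 0;
}

int main(void)
{
	char raw[4] = { 'D', 'A', 'T', 'A' };	/* no terminating NUL */

	return handle_label(raw, sizeof(raw));
}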
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b3e9b7bd7978..a0e66bc10093 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -280,9 +280,16 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
 		  bh, is_metadata, inode->i_mode,
 		  test_opt(inode->i_sb, DATA_FLAGS));
 
-	/* In the no journal case, we can just do a bforget and return */
+	/*
+	 * In the no journal case, we should wait for the ongoing buffer
+	 * to complete and do a forget.
+	 */
 	if (!ext4_handle_valid(handle)) {
-		bforget(bh);
+		if (bh) {
+			clear_buffer_dirty(bh);
+			wait_on_buffer(bh);
+			__bforget(bh);
+		}
 		return 0;
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9e4ac87211e..e99306a8f47c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5319,6 +5319,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	}
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	ext4_set_inode_flags(inode, true);
+	/* Detect invalid flag combination - can't have both inline data and extents */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+	    ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		ext4_error_inode(inode, function, line, 0,
+			"inode has both inline data and extents flags");
+		ret = -EFSCORRUPTED;
+		goto bad_inode;
+	}
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 	if (ext4_has_feature_64bit(sb))
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 33c3a89396b1..82d5e7501455 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -513,7 +513,7 @@ void ext4_release_orphan_info(struct super_block *sb)
 		return;
 	for (i = 0; i < oi->of_blocks; i++)
 		brelse(oi->of_binfo[i].ob_bh);
-	kfree(oi->of_binfo);
+	kvfree(oi->of_binfo);
 }
 
 static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
@@ -637,7 +637,7 @@ int ext4_init_orphan_info(struct super_block *sb)
out_free:
 	for (i--; i >= 0; i--)
 		brelse(oi->of_binfo[i].ob_bh);
-	kfree(oi->of_binfo);
+	kvfree(oi->of_binfo);
out_put:
 	iput(inode);
 	return ret;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ef38e62cda8f..775aa4f63aa3 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1497,8 +1497,8 @@ static bool f2fs_map_blocks_cached(struct inode *inode,
 		struct f2fs_dev_info *dev = &sbi->devs[bidx];
 
 		map->m_bdev = dev->bdev;
-		map->m_pblk -= dev->start_blk;
 		map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk);
+		map->m_pblk -= dev->start_blk;
 	} else {
 		map->m_bdev = inode->i_sb->s_bdev;
 	}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index fd8e7b0b2166..db7afb806411 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1820,7 +1820,7 @@ static int f2fs_drop_inode(struct inode *inode)
 			sb_end_intwrite(inode->i_sb);
 
 			spin_lock(&inode->i_lock);
-			iput(inode);
+			atomic_dec(&inode->i_count);
 		}
 		trace_f2fs_drop_inode(inode, 0);
 		return 0;
diff --git a/fs/file_attr.c b/fs/file_attr.c
index 12424d4945d0..1dcec88c0680 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -84,7 +84,7 @@ int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 	int error;
 
 	if (!inode->i_op->fileattr_get)
-		return -EOPNOTSUPP;
+		return -ENOIOCTLCMD;
 
 	error = security_inode_file_getattr(dentry, fa);
 	if (error)
@@ -270,7 +270,7 @@ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
 	int err;
 
 	if (!inode->i_op->fileattr_set)
-		return -EOPNOTSUPP;
+		return -ENOIOCTLCMD;
 
 	if (!inode_owner_or_capable(idmap, inode))
 		return -EPERM;
@@ -312,8 +312,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp)
 	int err;
 
 	err = vfs_fileattr_get(file->f_path.dentry, &fa);
-	if (err == -EOPNOTSUPP)
-		err = -ENOIOCTLCMD;
 	if (!err)
 		err = put_user(fa.flags, argp);
 	return err;
@@ -335,8 +333,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp)
 			fileattr_fill_flags(&fa, flags);
 			err = vfs_fileattr_set(idmap, dentry, &fa);
 			mnt_drop_write_file(file);
-			if (err == -EOPNOTSUPP)
-				err = -ENOIOCTLCMD;
 		}
 	}
 	return err;
@@ -349,8 +345,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp)
 	int err;
 
 	err = vfs_fileattr_get(file->f_path.dentry, &fa);
-	if (err == -EOPNOTSUPP)
-		err = -ENOIOCTLCMD;
 	if (!err)
 		err = copy_fsxattr_to_user(&fa, argp);
@@ -371,8 +365,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
 		if (!err) {
 			err = vfs_fileattr_set(idmap, dentry, &fa);
 			mnt_drop_write_file(file);
-			if (err == -EOPNOTSUPP)
-				err = -ENOIOCTLCMD;
 		}
 	}
 	return err;
@@ -424,6 +416,8 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
 	}
 
 	error = vfs_fileattr_get(filepath.dentry, &fa);
+	if (error == -ENOIOCTLCMD || error == -ENOTTY)
+		error = -EOPNOTSUPP;
 	if (error)
 		return error;
@@ -491,6 +485,8 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
 	if (!error) {
 		error = vfs_fileattr_set(mnt_idmap(filepath.mnt),
 					 filepath.dentry, &fa);
+		if (error == -ENOIOCTLCMD || error == -ENOTTY)
+			error = -EOPNOTSUPP;
 		mnt_drop_write(filepath.mnt);
 	}
diff --git a/fs/file_table.c b/fs/file_table.c
index b223d873e48b..cd4a3db4659a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -192,7 +192,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	f->f_sb_err	= 0;
 
 	/*
-	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While
 	 * fget-rcu pattern users need to be able to handle spurious
 	 * refcount bumps we should reinitialize the reused file first.
 	 */
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 57032eadca6c..fdc175e93f74 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -536,8 +536,6 @@ int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
cleanup:
 	fuse_priv_ioctl_cleanup(inode, ff);
 
-	if (err == -ENOTTY)
-		err = -EOPNOTSUPP;
 	return err;
 }
@@ -574,7 +572,5 @@ int fuse_fileattr_set(struct mnt_idmap *idmap,
cleanup:
 	fuse_priv_ioctl_cleanup(inode, ff);
 
-	if (err == -ENOTTY)
-		err = -EOPNOTSUPP;
 	return err;
 }
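The file_attr.c and fuse/ioctl.c hunks above move the error translation to the edges: vfs_fileattr_get()/vfs_fileattr_set() now return -ENOIOCTLCMD directly, and only the file_getattr()/file_setattr() syscalls rewrite it to -EOPNOTSUPP, instead of every ioctl caller mapping codes back and forth. A standalone sketch of translating an internal code once at the boundary; fileattr_get_inner() is illustrative and ENOIOCTLCMD is defined locally (it is kernel-internal and never reaches userspace):

#include <errno.h>
#include <stdio.h>

#define ENOIOCTLCMD 515		/* kernel-internal "no ioctl command" code */

/* Internal helper: reports the internal code when the op is unsupported. */
static int fileattr_get_inner(int supported)
{
	return supported ? 0 : -ENOIOCTLCMD;
}

/* Syscall boundary: rewrite internal codes into a real userspace errno. */
static long sys_file_getattr(int supported)
{
	int err = fileattr_get_inner(supported);

	if (err == -ENOIOCTLCMD || err == -ENOTTY)
		err = -EOPNOTSUPP;
	return err;
}

int main(void)
{
	printf("supported:   %ld\n", sys_file_getattr(1));	/* 0 */
	printf("unsupported: %ld\n", sys_file_getattr(0));	/* -EOPNOTSUPP */
	return 0;
}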
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9c94ed8c3ab0..f42548ee9083 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -478,14 +478,6 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
 		if (!hugetlb_vma_trylock_write(vma))
 			continue;
 
-		/*
-		 * Skip VMAs without shareable locks. Per the design in commit
-		 * 40549ba8f8e0, these will be handled by remove_inode_hugepages()
-		 * called after this function with proper locking.
-		 */
-		if (!__vma_shareable_lock(vma))
-			goto skip;
-
 		v_start = vma_offset_start(vma, start);
 		v_end = vma_offset_end(vma, end);
@@ -496,7 +488,6 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
 		 * vmas.  Therefore, lock is not held when calling
 		 * unmap_hugepage_range for private vmas.
 		 */
-skip:
 		hugetlb_vma_unlock_write(vma);
 	}
 }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c7867139af69..3e510564de6e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1659,6 +1659,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
 	int drop_reserve = 0;
 	int err = 0;
 	int was_modified = 0;
+	int wait_for_writeback = 0;
 
 	if (is_handle_aborted(handle))
 		return -EROFS;
@@ -1782,18 +1783,22 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
 		}
 
 		/*
-		 * The buffer is still not written to disk, we should
-		 * attach this buffer to current transaction so that the
-		 * buffer can be checkpointed only after the current
-		 * transaction commits.
+		 * The buffer has not yet been written to disk. We should
+		 * either clear the buffer or ensure that the ongoing I/O
+		 * is completed, and attach this buffer to current
+		 * transaction so that the buffer can be checkpointed only
+		 * after the current transaction commits.
 		 */
 		clear_buffer_dirty(bh);
+		wait_for_writeback = 1;
 		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
 		spin_unlock(&journal->j_list_lock);
 	}
drop:
 	__brelse(bh);
 	spin_unlock(&jh->b_state_lock);
+	if (wait_for_writeback)
+		wait_on_buffer(bh);
 	jbd2_journal_put_journal_head(jh);
 	if (drop_reserve) {
 		/* no need to reserve log space for this block -bzzz */
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index df01d2876b68..9056f05a67dc 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -270,19 +270,31 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
 	mirror->layout = NULL;
 }
 
-static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(u32 dss_count,
+							    gfp_t gfp_flags)
 {
 	struct nfs4_ff_layout_mirror *mirror;
-	u32 dss_id;
 
 	mirror = kzalloc(sizeof(*mirror), gfp_flags);
-	if (mirror != NULL) {
-		spin_lock_init(&mirror->lock);
-		refcount_set(&mirror->ref, 1);
-		INIT_LIST_HEAD(&mirror->mirrors);
-		for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
-			nfs_localio_file_init(&mirror->dss[dss_id].nfl);
+	if (mirror == NULL)
+		return NULL;
+
+	spin_lock_init(&mirror->lock);
+	refcount_set(&mirror->ref, 1);
+	INIT_LIST_HEAD(&mirror->mirrors);
+
+	mirror->dss_count = dss_count;
+	mirror->dss =
+		kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
+			gfp_flags);
+	if (mirror->dss == NULL) {
+		kfree(mirror);
+		return NULL;
 	}
+
+	for (u32 dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+		nfs_localio_file_init(&mirror->dss[dss_id].nfl);
+
 	return mirror;
 }
@@ -507,17 +519,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		if (dss_count > 1 && stripe_unit == 0)
 			goto out_err_free;
 
-		fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
+		fls->mirror_array[i] = ff_layout_alloc_mirror(dss_count, gfp_flags);
 		if (fls->mirror_array[i] == NULL) {
 			rc = -ENOMEM;
 			goto out_err_free;
 		}
 
-		fls->mirror_array[i]->dss_count = dss_count;
-		fls->mirror_array[i]->dss =
-		    kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
-			    gfp_flags);
-
 		for (dss_id = 0; dss_id < dss_count; dss_id++) {
 			dss_info = &fls->mirror_array[i]->dss[dss_id];
 			dss_info->mirror = fls->mirror_array[i];
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 6fddf43d729c..5998d6bd8a4f 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -222,6 +222,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 	clp->cl_mig_gen = 1;
+	clp->cl_last_renewal = jiffies;
#if IS_ENABLED(CONFIG_NFS_V4_1)
 	init_waitqueue_head(&clp->cl_lock_waitq);
#endif
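The flexfilelayout.c change above folds the dss array allocation into ff_layout_alloc_mirror(), so a kcalloc() failure is actually checked and the half-built mirror is freed, rather than the caller populating the array without testing the result. A standalone sketch of that allocate-or-unwind shape; struct mirror and its field names are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct stripe { int fd; };

struct mirror {
	unsigned int dss_count;
	struct stripe *dss;
};

/* Allocate the object and its array together; unwind on partial failure. */
static struct mirror *mirror_alloc(unsigned int dss_count)
{
	struct mirror *m = calloc(1, sizeof(*m));

	if (!m)
		return NULL;

	m->dss_count = dss_count;
	m->dss = calloc(dss_count, sizeof(*m->dss));
	if (!m->dss) {
		free(m);	/* don't leak the half-built object */
		return NULL;
	}

	for (unsigned int i = 0; i < dss_count; i++)
		m->dss[i].fd = -1;	/* per-stripe init, analogous to nfs_localio_file_init() */
	return m;
}

int main(void)
{
	struct mirror *m = mirror_alloc(4);

	if (!m)
		return 1;
	printf("mirror with %u stripes\n", m->dss_count);
	free(m->dss);
	free(m);
	return 0;
}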
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f58098417142..411776718494 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3636,6 +3636,7 @@ struct nfs4_closedata {
 	} lr;
 	struct nfs_fattr fattr;
 	unsigned long timestamp;
+	unsigned short retrans;
 };
 
 static void nfs4_free_closedata(void *data)
@@ -3664,6 +3665,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 		.state = state,
 		.inode = calldata->inode,
 		.stateid = &calldata->arg.stateid,
+		.retrans = calldata->retrans,
 	};
 
 	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -3711,6 +3713,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 		default:
 			task->tk_status = nfs4_async_handle_exception(task,
 					server, task->tk_status, &exception);
+			calldata->retrans = exception.retrans;
 			if (exception.retry)
 				goto out_restart;
 	}
@@ -5593,9 +5596,11 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 			.inode = hdr->inode,
 			.state = hdr->args.context->state,
 			.stateid = &hdr->args.stateid,
+			.retrans = hdr->retrans,
 		};
 		task->tk_status = nfs4_async_handle_exception(task,
 				server, task->tk_status, &exception);
+		hdr->retrans = exception.retrans;
 		if (exception.retry) {
 			rpc_restart_call_prepare(task);
 			return -EAGAIN;
@@ -5709,10 +5714,12 @@ static int nfs4_write_done_cb(struct rpc_task *task,
 			.inode = hdr->inode,
 			.state = hdr->args.context->state,
 			.stateid = &hdr->args.stateid,
+			.retrans = hdr->retrans,
 		};
 		task->tk_status = nfs4_async_handle_exception(task,
 				NFS_SERVER(inode), task->tk_status,
 				&exception);
+		hdr->retrans = exception.retrans;
 		if (exception.retry) {
 			rpc_restart_call_prepare(task);
 			return -EAGAIN;
@@ -6726,6 +6733,7 @@ struct nfs4_delegreturndata {
 	struct nfs_fh fh;
 	nfs4_stateid stateid;
 	unsigned long timestamp;
+	unsigned short retrans;
 	struct {
 		struct nfs4_layoutreturn_args arg;
 		struct nfs4_layoutreturn_res res;
@@ -6746,6 +6754,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 		.inode = data->inode,
 		.stateid = &data->stateid,
 		.task_is_privileged = data->args.seq_args.sa_privileged,
+		.retrans = data->retrans,
 	};
 
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
@@ -6817,6 +6826,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 		task->tk_status = nfs4_async_handle_exception(task,
 				data->res.server, task->tk_status,
 				&exception);
+		data->retrans = exception.retrans;
 		if (exception.retry)
 			goto out_restart;
 	}
@@ -7093,6 +7103,7 @@ struct nfs4_unlockdata {
 	struct file_lock fl;
 	struct nfs_server *server;
 	unsigned long timestamp;
+	unsigned short retrans;
 };
 
 static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
@@ -7147,6 +7158,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 	struct nfs4_exception exception = {
 		.inode = calldata->lsp->ls_state->inode,
 		.stateid = &calldata->arg.stateid,
+		.retrans = calldata->retrans,
 	};
 
 	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -7180,6 +7192,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 			task->tk_status = nfs4_async_handle_exception(task,
 					calldata->server, task->tk_status,
 					&exception);
+			calldata->retrans = exception.retrans;
 			if (exception.retry)
 				rpc_restart_call_prepare(task);
 	}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0fb6905736d5..336c510f3750 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1535,7 +1535,8 @@ static int nfs_writeback_done(struct rpc_task *task,
 	/* Deal with the suid/sgid bit corner case */
 	if (nfs_should_remove_suid(inode)) {
 		spin_lock(&inode->i_lock);
-		nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+		nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE
+				| NFS_INO_REVAL_FORCED);
 		spin_unlock(&inode->i_lock);
 	}
 	return 0;
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index c318cf74e388..0f1a35400cd5 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -125,6 +125,13 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
 	return 0;
 }
 
+static __be32
+nfsd4_ff_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
+		struct nfsd4_layoutcommit *lcp)
+{
+	return nfs_ok;
+}
+
 const struct nfsd4_layout_ops ff_layout_ops = {
 	.notify_types		=
 			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
@@ -133,4 +140,5 @@ const struct nfsd4_layout_ops ff_layout_ops = {
 	.encode_getdeviceinfo	= nfsd4_ff_encode_getdeviceinfo,
 	.proc_layoutget		= nfsd4_ff_proc_layoutget,
 	.encode_layoutget	= nfsd4_ff_encode_layoutget,
+	.proc_layoutcommit	= nfsd4_ff_proc_layoutcommit,
 };
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e466cf52d7d7..7f7e6bb23a90 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -988,10 +988,11 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 static void
 nfsd4_read_release(union nfsd4_op_u *u)
 {
-	if (u->read.rd_nf)
+	if (u->read.rd_nf) {
+		trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
+				     u->read.rd_offset, u->read.rd_length);
 		nfsd_file_put(u->read.rd_nf);
-	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
-			     u->read.rd_offset, u->read.rd_length);
+	}
 }
 
 static __be32
@@ -2892,10 +2893,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 
 	rqstp->rq_lease_breaker = (void **)&cstate->clp;
 
-	trace_nfsd_compound(rqstp, args->tag, args->taglen, args->opcnt);
+	trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt);
 	while (!status && resp->opcnt < args->opcnt) {
 		op = &args->ops[resp->opcnt++];
 
+		if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
+			/* If there are still more operations to process,
+			 * stop here and report NFS4ERR_RESOURCE. */
+			if (cstate->minorversion == 0 &&
+			    args->client_opcnt > resp->opcnt) {
+				op->status = nfserr_resource;
+				goto encode_op;
+			}
+		}
+
 		/*
 		 * The XDR decode routines may have pre-set op->status;
 		 * for example, if there is a miscellaneous XDR error
@@ -2972,7 +2983,7 @@ encode_op:
 			status = op->status;
 		}
 
-		trace_nfsd_compound_status(args->opcnt, resp->opcnt,
+		trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
 					   status, nfsd4_op_name(op->opnum));
 
 		nfsd4_cstate_clear_replay(cstate);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 81fa7cc6c77b..c1b54322c412 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3902,6 +3902,7 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
 	ca->headerpadsz = 0;
 	ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
 	ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
+	ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
 	ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
 			NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
 	ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c0a3c6a7c8bb..6040a6145dad 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2488,8 +2488,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 	if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
 		return false;
-	if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
+	if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
 		return false;
+	argp->opcnt = min_t(u32, argp->client_opcnt,
+			    NFSD_MAX_OPS_PER_COMPOUND);
 
 	if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
 		argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
@@ -2628,10 +2630,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
 	__be32 *p;
 	__be32 pathlen;
 	int pathlen_offset;
-	int strlen, count=0;
 	char *str, *end, *next;
-
-	dprintk("nfsd4_encode_components(%s)\n", components);
+	int count = 0;
 
 	pathlen_offset = xdr->buf->len;
 	p = xdr_reserve_space(xdr, 4);
@@ -2658,9 +2658,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
 			for (; *end && (*end != sep); end++)
 				/* find sep or end of string */;
-		strlen = end - str;
-		if (strlen) {
-			if (xdr_stream_encode_opaque(xdr, str, strlen) < 0)
+		if (end > str) {
+			if (xdr_stream_encode_opaque(xdr, str, end - str) < 0)
 				return nfserr_resource;
 			count++;
 		} else
@@ -2939,6 +2938,12 @@ struct nfsd4_fattr_args {
 typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr,
 				const struct nfsd4_fattr_args *args);
 
+static __be32 nfsd4_encode_fattr4__inval(struct xdr_stream *xdr,
+					 const struct nfsd4_fattr_args *args)
+{
+	return nfserr_inval;
+}
+
 static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr,
 					const struct nfsd4_fattr_args *args)
 {
@@ -3560,6 +3565,8 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
 	[FATTR4_MODE_UMASK]		= nfsd4_encode_fattr4__noop,
 	[FATTR4_XATTR_SUPPORT]		= nfsd4_encode_fattr4_xattr_support,
+	[FATTR4_TIME_DELEG_ACCESS]	= nfsd4_encode_fattr4__inval,
+	[FATTR4_TIME_DELEG_MODIFY]	= nfsd4_encode_fattr4__inval,
 	[FATTR4_OPEN_ARGUMENTS]		= nfsd4_encode_fattr4_open_arguments,
 };
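The nfsd hunks above keep two counts: the client's advertised operation count (client_opcnt) for tracing and error reporting, and a clamped opcnt (at most NFSD_MAX_OPS_PER_COMPOUND) that bounds what the server decodes and allocates. A standalone sketch of the clamp-but-remember pattern; the 200-op limit is taken from the diff and decode_compound() is illustrative:

#include <stdint.h>
#include <stdio.h>

#define MAX_OPS_PER_COMPOUND 200u	/* server-side cap, per the diff */

struct compound_args {
	uint32_t client_opcnt;	/* what the client asked for, kept for reporting */
	uint32_t opcnt;		/* what the server will actually process */
};

static void decode_compound(struct compound_args *args, uint32_t wire_opcnt)
{
	args->client_opcnt = wire_opcnt;
	args->opcnt = wire_opcnt < MAX_OPS_PER_COMPOUND ?
		      wire_opcnt : MAX_OPS_PER_COMPOUND;
}

int main(void)
{
	struct compound_args args;

	decode_compound(&args, 1000);	/* hostile or buggy client */
	if (args.client_opcnt > args.opcnt)
		printf("clamped %u -> %u: excess ops get NFS4ERR_RESOURCE\n",
		       args.client_opcnt, args.opcnt);
	return 0;
}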
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index ea87b42894dd..f19320018639 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -57,6 +57,9 @@ struct readdir_cd {
 	__be32			err;	/* 0, nfserr, or nfserr_eof */
 };
 
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND	200
+
 struct nfsd_genl_rqstp {
 	struct sockaddr		rq_daddr;
 	struct sockaddr		rq_saddr;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d4b48602b2b0..ee0570cbdd9e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -903,6 +903,7 @@ struct nfsd4_compoundargs {
 	char *				tag;
 	u32				taglen;
 	u32				minorversion;
+	u32				client_opcnt;
 	u32				opcnt;
 	bool				splice_ok;
 	struct nfsd4_op			*ops;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 1161eabf11ee..9cc7eb863643 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -17,6 +17,7 @@
#include "fanotify/fanotify.h"
#include "fdinfo.h"
#include "fsnotify.h"
+#include "../internal.h"
 
#if defined(CONFIG_PROC_FS)
@@ -46,7 +47,12 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
 
 	size = f->handle_bytes >> 2;
 
+	if (!super_trylock_shared(inode->i_sb))
+		return;
+
 	ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size);
+	up_read(&inode->i_sb->s_umount);
+
 	if ((ret == FILEID_INVALID) || (ret < 0))
 		return;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 648dc59bef7f..79b026a36fb6 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -490,7 +490,9 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 		VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
 		VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type);
-		VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
+
+		if (ns->inum != fid->ns_inum)
+			return NULL;
 
 		if (!__ns_ref_get(ns))
 			return NULL;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 86f2631e6360..10923bf7c8b8 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -867,6 +867,11 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
 			mlog_errno(ret);
 			goto out;
 		}
+		/*
+		 * Invalidate extent cache after moving/defragging to prevent
+		 * stale cached data with outdated extent flags.
+		 */
+		ocfs2_extent_map_trunc(inode, cpos);
 
 		context->clusters_moved += alloc_size;
next:
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index aac7e34f56c1..604a82acd164 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -178,7 +178,7 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
 	err = ovl_real_fileattr_get(old, &oldfa);
 	if (err) {
 		/* Ntfs-3g returns -EINVAL for "no fileattr support" */
-		if (err == -EOPNOTSUPP || err == -EINVAL)
+		if (err == -ENOTTY || err == -EINVAL)
 			return 0;
 		pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
 			old->dentry, err);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index fc52c796061d..7ab2c9daffd0 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -369,11 +369,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	if (!ovl_should_sync(OVL_FS(inode->i_sb)))
 		ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
 
-	/*
-	 * Overlayfs doesn't support deferred completions, don't copy
-	 * this property in case it is set by the issuer.
-	 */
-	ifl &= ~IOCB_DIO_CALLER_COMP;
 	ret = backing_file_write_iter(realfile, iter, iocb, ifl, &ctx);
 
out_unlock:
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index aaa4cf579561..e11f310ce092 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -720,7 +720,10 @@ int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa)
 	if (err)
 		return err;
 
-	return vfs_fileattr_get(realpath->dentry, fa);
+	err = vfs_fileattr_get(realpath->dentry, fa);
+	if (err == -ENOIOCTLCMD)
+		err = -ENOTTY;
+	return err;
 }
 
 int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index 4076336fbba6..572a9925bd6c 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -1782,15 +1782,13 @@ int resctrl_mon_resource_init(void)
 		mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
 
 	if (r->mon.mbm_cntr_assignable) {
-		if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
-			resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
-		if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
-			resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
-		mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
-		mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
-								   (READS_TO_LOCAL_MEM |
-								    READS_TO_LOCAL_S_MEM |
-								    NON_TEMP_WRITE_TO_LOCAL_MEM);
+		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+			mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+			mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+									   (READS_TO_LOCAL_MEM |
+									    READS_TO_LOCAL_S_MEM |
+									    NON_TEMP_WRITE_TO_LOCAL_MEM);
 		r->mon.mbm_assign_on_mkdir = true;
 		resctrl_file_fflags_init("num_mbm_cntrs",
 					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index a4c02199fef4..17bd368574e9 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -5,17 +5,16 @@ config CIFS
 	select NLS
 	select NLS_UCS2_UTILS
 	select CRYPTO
-	select CRYPTO_MD5
-	select CRYPTO_SHA256
-	select CRYPTO_SHA512
 	select CRYPTO_CMAC
-	select CRYPTO_HMAC
 	select CRYPTO_AEAD2
 	select CRYPTO_CCM
 	select CRYPTO_GCM
 	select CRYPTO_ECB
 	select CRYPTO_AES
 	select CRYPTO_LIB_ARC4
+	select CRYPTO_LIB_MD5
+	select CRYPTO_LIB_SHA256
+	select CRYPTO_LIB_SHA512
 	select KEYS
 	select DNS_RESOLVER
 	select ASN1
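The fdinfo.c change above guards exportfs_encode_fid() with super_trylock_shared(): if the superblock lock cannot be taken without blocking, the fdinfo line is simply skipped rather than risking a stall from the proc path. A standalone sketch of the try-lock-or-skip pattern using POSIX rwlocks; show_handle() is illustrative (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t sb_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Best-effort reporting: skip rather than block if the lock is contended. */
static void show_handle(int id)
{
	if (pthread_rwlock_tryrdlock(&sb_lock) != 0)
		return;			/* held exclusively elsewhere: bail out */

	printf("handle %d encoded under shared lock\n", id);
	pthread_rwlock_unlock(&sb_lock);
}

int main(void)
{
	show_handle(1);				/* uncontended: prints */

	pthread_rwlock_wrlock(&sb_lock);	/* simulate an exclusive holder */
	show_handle(2);				/* silently skipped */
	pthread_rwlock_unlock(&sb_lock);
	return 0;
}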
diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c
index 63b3b1290bed..ce2ebc213a1d 100644
--- a/fs/smb/client/cifsacl.c
+++ b/fs/smb/client/cifsacl.c
@@ -339,7 +339,6 @@ int
 sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid,
 		struct cifs_fattr *fattr, uint sidtype)
 {
-	int rc = 0;
 	struct key *sidkey;
 	char *sidstr;
 	const struct cred *saved_cred;
@@ -446,12 +445,12 @@ out_revert_creds:
 	 * fails then we just fall back to using the ctx->linux_uid/linux_gid.
 	 */
got_valid_id:
-	rc = 0;
 	if (sidtype == SIDOWNER)
 		fattr->cf_uid = fuid;
 	else
 		fattr->cf_gid = fgid;
-	return rc;
+
+	return 0;
 }
 
 int
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index 7b7c8c38fdd0..801824825ecf 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -24,14 +24,43 @@
#include <linux/iov_iter.h>
#include <crypto/aead.h>
#include <crypto/arc4.h>
+#include <crypto/md5.h>
+#include <crypto/sha2.h>
 
-static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len,
-			      void *priv, void *priv2)
+static int cifs_sig_update(struct cifs_calc_sig_ctx *ctx,
+			   const u8 *data, size_t len)
 {
-	struct shash_desc *shash = priv;
+	if (ctx->md5) {
+		md5_update(ctx->md5, data, len);
+		return 0;
+	}
+	if (ctx->hmac) {
+		hmac_sha256_update(ctx->hmac, data, len);
+		return 0;
+	}
+	return crypto_shash_update(ctx->shash, data, len);
+}
+
+static int cifs_sig_final(struct cifs_calc_sig_ctx *ctx, u8 *out)
+{
+	if (ctx->md5) {
+		md5_final(ctx->md5, out);
+		return 0;
+	}
+	if (ctx->hmac) {
+		hmac_sha256_final(ctx->hmac, out);
+		return 0;
+	}
+	return crypto_shash_final(ctx->shash, out);
+}
+
+static size_t cifs_sig_step(void *iter_base, size_t progress, size_t len,
+			    void *priv, void *priv2)
+{
+	struct cifs_calc_sig_ctx *ctx = priv;
 	int ret, *pret = priv2;
 
-	ret = crypto_shash_update(shash, iter_base, len);
+	ret = cifs_sig_update(ctx, iter_base, len);
 	if (ret < 0) {
 		*pret = ret;
 		return len;
@@ -42,21 +71,20 @@ static size_t cifs_sig_step(void *iter_base, size_t progress, size_t len,
 /*
  * Pass the data from an iterator into a hash.
  */
-static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize,
-			   struct shash_desc *shash)
+static int cifs_sig_iter(const struct iov_iter *iter, size_t maxsize,
+			 struct cifs_calc_sig_ctx *ctx)
 {
 	struct iov_iter tmp_iter = *iter;
 	int err = -EIO;
 
-	if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err,
-				       cifs_shash_step) != maxsize)
+	if (iterate_and_advance_kernel(&tmp_iter, maxsize, ctx, &err,
+				       cifs_sig_step) != maxsize)
 		return err;
 	return 0;
 }
 
-int __cifs_calc_signature(struct smb_rqst *rqst,
-			  struct TCP_Server_Info *server, char *signature,
-			  struct shash_desc *shash)
+int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
+			  char *signature, struct cifs_calc_sig_ctx *ctx)
 {
 	int i;
 	ssize_t rc;
@@ -82,8 +110,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
 			return -EIO;
 		}
 
-		rc = crypto_shash_update(shash,
-					 iov[i].iov_base, iov[i].iov_len);
+		rc = cifs_sig_update(ctx, iov[i].iov_base, iov[i].iov_len);
 		if (rc) {
 			cifs_dbg(VFS, "%s: Could not update with payload\n",
 				 __func__);
 			return rc;
 		}
 	}
 
-	rc = cifs_shash_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), shash);
+	rc = cifs_sig_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), ctx);
 	if (rc < 0)
 		return rc;
 
-	rc = crypto_shash_final(shash, signature);
+	rc = cifs_sig_final(ctx, signature);
 	if (rc)
 		cifs_dbg(VFS, "%s: Could not generate hash\n", __func__);
 
@@ -112,29 +139,22 @@ int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
 static int cifs_calc_signature(struct smb_rqst *rqst,
 			struct TCP_Server_Info *server, char *signature)
 {
-	int rc;
+	struct md5_ctx ctx;
 
 	if (!rqst->rq_iov || !signature || !server)
 		return -EINVAL;
-
-	rc = cifs_alloc_hash("md5", &server->secmech.md5);
-	if (rc)
-		return -1;
-
-	rc = crypto_shash_init(server->secmech.md5);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
-		return rc;
+	if (fips_enabled) {
+		cifs_dbg(VFS,
+			 "MD5 signature support is disabled due to FIPS\n");
+		return -EOPNOTSUPP;
 	}
 
-	rc = crypto_shash_update(server->secmech.md5,
-		server->session_key.response, server->session_key.len);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
-		return rc;
-	}
+	md5_init(&ctx);
+	md5_update(&ctx, server->session_key.response, server->session_key.len);
 
-	return __cifs_calc_signature(rqst, server, signature, server->secmech.md5);
+	return __cifs_calc_signature(
+		rqst, server, signature,
+		&(struct cifs_calc_sig_ctx){ .md5 = &ctx });
 }
 
 /* must be called with server->srv_mutex held */
@@ -405,11 +425,11 @@ static __le64 find_timestamp(struct cifs_ses *ses)
 }
 
 static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
-			    const struct nls_table *nls_cp, struct shash_desc *hmacmd5)
+			    const struct nls_table *nls_cp)
 {
-	int rc = 0;
 	int len;
 	char nt_hash[CIFS_NTHASH_SIZE];
+	struct hmac_md5_ctx hmac_ctx;
 	__le16 *user;
 	wchar_t *domain;
 	wchar_t *server;
@@ -417,17 +437,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 	/* calculate md4 hash of password */
 	E_md4hash(ses->password, nt_hash, nls_cp);
 
-	rc = crypto_shash_setkey(hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not set NT hash as a key, rc=%d\n", __func__, rc);
-		return rc;
-	}
-
-	rc = crypto_shash_init(hmacmd5);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
-		return rc;
-	}
+	hmac_md5_init_usingrawkey(&hmac_ctx, nt_hash, CIFS_NTHASH_SIZE);
 
 	/* convert ses->user_name to unicode */
 	len = ses->user_name ? strlen(ses->user_name) : 0;
@@ -442,12 +452,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 		*(u16 *)user = 0;
 	}
 
-	rc = crypto_shash_update(hmacmd5, (char *)user, 2 * len);
+	hmac_md5_update(&hmac_ctx, (const u8 *)user, 2 * len);
 	kfree(user);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with user, rc=%d\n", __func__, rc);
-		return rc;
-	}
 
 	/* convert ses->domainName to unicode and uppercase */
 	if (ses->domainName) {
@@ -459,12 +465,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 		len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
 				      nls_cp);
-		rc = crypto_shash_update(hmacmd5, (char *)domain, 2 * len);
+		hmac_md5_update(&hmac_ctx, (const u8 *)domain, 2 * len);
 		kfree(domain);
-		if (rc) {
-			cifs_dbg(VFS, "%s: Could not update with domain, rc=%d\n", __func__, rc);
-			return rc;
-		}
 	} else {
 		/* We use ses->ip_addr if no domain name available */
 		len = strlen(ses->ip_addr);
@@ -474,25 +476,16 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 			return -ENOMEM;
 
 		len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp);
-		rc = crypto_shash_update(hmacmd5, (char *)server, 2 * len);
+		hmac_md5_update(&hmac_ctx, (const u8 *)server, 2 * len);
 		kfree(server);
-		if (rc) {
-			cifs_dbg(VFS, "%s: Could not update with server, rc=%d\n", __func__, rc);
-			return rc;
-		}
 	}
 
-	rc = crypto_shash_final(hmacmd5, ntlmv2_hash);
-	if (rc)
-		cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
-
-	return rc;
+	hmac_md5_final(&hmac_ctx, ntlmv2_hash);
+	return 0;
 }
 
-static int
-CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_desc *hmacmd5)
+static void CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
 {
-	int rc;
 	struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *)
 	    (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 	unsigned int hash_len;
@@ -501,35 +494,15 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
 	hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
 		offsetof(struct ntlmv2_resp, challenge.key[0]));
 
-	rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc);
-		return rc;
-	}
-
-	rc = crypto_shash_init(hmacmd5);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
-		return rc;
-	}
-
 	if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
 		memcpy(ntlmv2->challenge.key, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
 	else
 		memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
 
-	rc = crypto_shash_update(hmacmd5, ntlmv2->challenge.key, hash_len);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc);
-		return rc;
-	}
-
-	/* Note that the MD5 digest over writes anon.challenge_key.key */
-	rc = crypto_shash_final(hmacmd5, ntlmv2->ntlmv2_hash);
-	if (rc)
-		cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
-
-	return rc;
+	/* Note that the HMAC-MD5 value overwrites ntlmv2->challenge.key */
+	hmac_md5_usingrawkey(ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE,
+			     ntlmv2->challenge.key, hash_len,
+			     ntlmv2->ntlmv2_hash);
 }
 
 /*
@@ -586,7 +559,6 @@ out:
 int
 setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 {
-	struct shash_desc *hmacmd5 = NULL;
 	unsigned char *tiblob = NULL; /* target info blob */
 	struct ntlmv2_resp *ntlmv2;
 	char ntlmv2_hash[16];
@@ -657,51 +629,29 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	ntlmv2->client_chal = cc;
 	ntlmv2->reserved2 = 0;
 
-	rc = cifs_alloc_hash("hmac(md5)", &hmacmd5);
-	if (rc) {
-		cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc);
+	if (fips_enabled) {
+		cifs_dbg(VFS, "NTLMv2 support is disabled due to FIPS\n");
+		rc = -EOPNOTSUPP;
 		goto unlock;
 	}
 
 	/* calculate ntlmv2_hash */
-	rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp, hmacmd5);
+	rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
 	if (rc) {
 		cifs_dbg(VFS, "Could not get NTLMv2 hash, rc=%d\n", rc);
 		goto unlock;
 	}
 
 	/* calculate first part of the client response (CR1) */
-	rc = CalcNTLMv2_response(ses, ntlmv2_hash, hmacmd5);
-	if (rc) {
-		cifs_dbg(VFS, "Could not calculate CR1, rc=%d\n", rc);
-		goto unlock;
-	}
+	CalcNTLMv2_response(ses, ntlmv2_hash);
 
 	/* now calculate the session key for NTLMv2 */
-	rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc);
-		goto unlock;
-	}
-
-	rc = crypto_shash_init(hmacmd5);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
-		goto unlock;
-	}
-
-	rc = crypto_shash_update(hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
-	if (rc) {
-		cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc);
-		goto unlock;
-	}
-
-	rc = crypto_shash_final(hmacmd5, ses->auth_key.response);
-	if (rc)
-		cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
+	hmac_md5_usingrawkey(ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE,
+			     ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE,
+			     ses->auth_key.response);
+	rc = 0;
 
unlock:
 	cifs_server_unlock(ses->server);
-	cifs_free_hash(&hmacmd5);
setup_ntlmv2_rsp_ret:
 	kfree_sensitive(tiblob);
@@ -743,9 +693,6 @@ void
 cifs_crypto_secmech_release(struct TCP_Server_Info *server)
 {
 	cifs_free_hash(&server->secmech.aes_cmac);
-	cifs_free_hash(&server->secmech.hmacsha256);
-	cifs_free_hash(&server->secmech.md5);
-	cifs_free_hash(&server->secmech.sha512);
 
 	if (server->secmech.enc) {
 		crypto_free_aead(server->secmech.enc);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 05b1fa76e8cc..185ac41bd7e9 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -173,7 +173,7 @@ module_param(enable_oplocks, bool, 0644);
 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
 
 module_param(enable_gcm_256, bool, 0644);
-MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0");
+MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/1");
 
 module_param(require_gcm_256, bool, 0644);
 MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
@@ -2139,13 +2139,9 @@ MODULE_DESCRIPTION
 	"also older servers complying with the SNIA CIFS Specification)");
 MODULE_VERSION(CIFS_VERSION);
 MODULE_SOFTDEP("ecb");
-MODULE_SOFTDEP("hmac");
-MODULE_SOFTDEP("md5");
 MODULE_SOFTDEP("nls");
 MODULE_SOFTDEP("aes");
 MODULE_SOFTDEP("cmac");
-MODULE_SOFTDEP("sha256");
-MODULE_SOFTDEP("sha512");
 MODULE_SOFTDEP("aead2");
 MODULE_SOFTDEP("ccm");
 MODULE_SOFTDEP("gcm");
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 8f6f567d7474..203e2aaa3c25 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -24,6 +24,7 @@
#include "cifsacl.h"
#include <crypto/internal/hash.h>
#include <uapi/linux/cifs/cifs_mount.h>
+#include "../common/cifsglob.h"
#include "../common/smb2pdu.h"
#include "smb2pdu.h"
#include <linux/filelock.h>
@@ -221,9 +222,6 @@ struct session_key {
 
 /* crypto hashing related structure/fields, not specific to a sec mech */
 struct cifs_secmech {
-	struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */
-	struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */
-	struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */
 	struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */
 
 	struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */
@@ -536,8 +534,6 @@ struct smb_version_operations {
 	void (*new_lease_key)(struct cifs_fid *);
 	int (*generate_signingkey)(struct cifs_ses *ses,
 				   struct TCP_Server_Info *server);
-	int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *,
-				bool allocate_crypto);
 	int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
 			     struct cifsFileInfo *src_file);
 	int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon,
@@ -702,12 +698,6 @@ get_rfc1002_length(void *buf)
 	return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
 }
 
-static inline void
-inc_rfc1001_len(void *buf, int count)
-{
-	be32_add_cpu((__be32 *)buf, count);
-}
-
 struct TCP_Server_Info {
 	struct list_head tcp_ses_list;
 	struct list_head smb_ses_list;
@@ -740,7 +730,7 @@ struct TCP_Server_Info {
 	bool nosharesock;
 	bool tcp_nodelay;
 	bool terminate;
-	unsigned int credits;  /* send no more requests at once */
+	int credits;  /* send no more requests at once */
 	unsigned int max_credits; /* can override large 32000 default at mnt */
 	unsigned int in_flight;  /* number of requests on the wire to server */
 	unsigned int max_in_flight; /* max number of requests that were on wire */
@@ -1021,8 +1011,6 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
 #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
 #define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
 
-#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
-
 /*
  * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
  * those values when posix extensions aren't in force.
In actuality here, we @@ -2148,30 +2136,20 @@ extern mempool_t cifs_io_request_pool;  extern mempool_t cifs_io_subrequest_pool;  /* Operations for different SMB versions */ -#define SMB1_VERSION_STRING	"1.0" -#define SMB20_VERSION_STRING    "2.0"  #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY  extern struct smb_version_operations smb1_operations;  extern struct smb_version_values smb1_values;  extern struct smb_version_operations smb20_operations;  extern struct smb_version_values smb20_values;  #endif /* CIFS_ALLOW_INSECURE_LEGACY */ -#define SMB21_VERSION_STRING	"2.1"  extern struct smb_version_operations smb21_operations;  extern struct smb_version_values smb21_values; -#define SMBDEFAULT_VERSION_STRING "default"  extern struct smb_version_values smbdefault_values; -#define SMB3ANY_VERSION_STRING "3"  extern struct smb_version_values smb3any_values; -#define SMB30_VERSION_STRING	"3.0"  extern struct smb_version_operations smb30_operations;  extern struct smb_version_values smb30_values; -#define SMB302_VERSION_STRING	"3.02" -#define ALT_SMB302_VERSION_STRING "3.0.2"  /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */  extern struct smb_version_values smb302_values; -#define SMB311_VERSION_STRING	"3.1.1" -#define ALT_SMB311_VERSION_STRING "3.11"  extern struct smb_version_operations smb311_operations;  extern struct smb_version_values smb311_values; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index e8fba98690ce..3528c365a452 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -9,6 +9,7 @@  #define _CIFSPROTO_H  #include <linux/nls.h>  #include <linux/ctype.h> +#include "cifsglob.h"  #include "trace.h"  #ifdef CONFIG_CIFS_DFS_UPCALL  #include "dfs_cache.h" @@ -615,6 +616,8 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,  extern struct TCP_Server_Info *  cifs_find_tcp_session(struct smb3_fs_context *ctx); +struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal); +  void __cifs_put_smb_ses(struct cifs_ses *ses);  extern struct cifs_ses * @@ -632,9 +635,13 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,  			   struct cifs_sb_info *cifs_sb,  			   const unsigned char *path, char *pbuf,  			   unsigned int *pbytes_written); -int __cifs_calc_signature(struct smb_rqst *rqst, -			struct TCP_Server_Info *server, char *signature, -			struct shash_desc *shash); +struct cifs_calc_sig_ctx { +	struct md5_ctx *md5; +	struct hmac_sha256_ctx *hmac; +	struct shash_desc *shash; +}; +int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, +			  char *signature, struct cifs_calc_sig_ctx *ctx);  enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,  					enum securityEnum); diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 2881efcbe09a..7da194f29fef 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1311,6 +1311,8 @@ cifs_readv_callback(struct mid_q_entry *mid)  		.rreq_debug_id = rdata->rreq->debug_id,  		.rreq_debug_index = rdata->subreq.debug_index,  	}; +	unsigned int rreq_debug_id = rdata->rreq->debug_id; +	unsigned int subreq_debug_index = rdata->subreq.debug_index;  	cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",  		 __func__, mid->mid, mid->mid_state, rdata->result, @@ -1374,6 +1376,9 @@ do_retry:  			__set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags);  	} +	trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, +			      server->credits, server->in_flight, +	
0, cifs_trace_rw_credits_read_response_clear);  	rdata->credits.value = 0;  	rdata->subreq.error = rdata->result;  	rdata->subreq.transferred += rdata->got_bytes; @@ -1381,6 +1386,9 @@ do_retry:  	netfs_read_subreq_terminated(&rdata->subreq);  	release_mid(mid);  	add_credits(server, &credits, 0); +	trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, +			      server->credits, server->in_flight, +			      credits.value, cifs_trace_rw_credits_read_response_add);  }  /* cifs_async_readv - send an async read, and set up mid to handle result */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index dd12f3eb61dc..55cb4b0cbd48 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -310,6 +310,8 @@ cifs_abort_connection(struct TCP_Server_Info *server)  			 server->ssocket->flags);  		sock_release(server->ssocket);  		server->ssocket = NULL; +	} else if (cifs_rdma_enabled(server)) { +		smbd_destroy(server);  	}  	server->sequence_number = 0;  	server->session_estab = false; @@ -338,12 +340,6 @@ cifs_abort_connection(struct TCP_Server_Info *server)  		mid_execute_callback(mid);  		release_mid(mid);  	} - -	if (cifs_rdma_enabled(server)) { -		cifs_server_lock(server); -		smbd_destroy(server); -		cifs_server_unlock(server); -	}  }  static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets) @@ -2015,39 +2011,31 @@ static int match_session(struct cifs_ses *ses,  /**   * cifs_setup_ipc - helper to setup the IPC tcon for the session   * @ses: smb session to issue the request on - * @ctx: the superblock configuration context to use for building the - *       new tree connection for the IPC (interprocess communication RPC) + * @seal: whether encryption is requested for the IPC tree connection   *   * A new IPC connection is made and stored in the session   * tcon_ipc. The IPC tcon has the same lifetime as the session.   */ -static int -cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) +struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal)  {  	int rc = 0, xid;  	struct cifs_tcon *tcon;  	char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0}; -	bool seal = false;  	struct TCP_Server_Info *server = ses->server;  	/*  	 * If the mount request that resulted in the creation of the  	 * session requires encryption, force IPC to be encrypted too.  	 
*/ -	if (ctx->seal) { -		if (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) -			seal = true; -		else { -			cifs_server_dbg(VFS, -				 "IPC: server doesn't support encryption\n"); -			return -EOPNOTSUPP; -		} +	if (seal && !(server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) { +		cifs_server_dbg(VFS, "IPC: server doesn't support encryption\n"); +		return ERR_PTR(-EOPNOTSUPP);  	}  	/* no need to setup directory caching on IPC share, so pass in false */  	tcon = tcon_info_alloc(false, netfs_trace_tcon_ref_new_ipc);  	if (tcon == NULL) -		return -ENOMEM; +		return ERR_PTR(-ENOMEM);  	spin_lock(&server->srv_lock);  	scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", server->hostname); @@ -2057,13 +2045,13 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)  	tcon->ses = ses;  	tcon->ipc = true;  	tcon->seal = seal; -	rc = server->ops->tree_connect(xid, ses, unc, tcon, ctx->local_nls); +	rc = server->ops->tree_connect(xid, ses, unc, tcon, ses->local_nls);  	free_xid(xid);  	if (rc) { -		cifs_server_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc); +		cifs_server_dbg(VFS | ONCE, "failed to connect to IPC (rc=%d)\n", rc);  		tconInfoFree(tcon, netfs_trace_tcon_ref_free_ipc_fail); -		goto out; +		return ERR_PTR(rc);  	}  	cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid); @@ -2071,9 +2059,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)  	spin_lock(&tcon->tc_lock);  	tcon->status = TID_GOOD;  	spin_unlock(&tcon->tc_lock); -	ses->tcon_ipc = tcon; -out: -	return rc; +	return tcon;  }  static struct cifs_ses * @@ -2347,6 +2333,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)  {  	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;  	struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; +	struct cifs_tcon *ipc;  	struct cifs_ses *ses;  	unsigned int xid;  	int retries = 0; @@ -2525,7 +2512,12 @@ retry_new_session:  	list_add(&ses->smb_ses_list, &server->smb_ses_list);  	spin_unlock(&cifs_tcp_ses_lock); -	cifs_setup_ipc(ses, ctx); +	ipc = cifs_setup_ipc(ses, ctx->seal); +	spin_lock(&cifs_tcp_ses_lock); +	spin_lock(&ses->ses_lock); +	ses->tcon_ipc = !IS_ERR(ipc) ? 
ipc : NULL; +	spin_unlock(&ses->ses_lock); +	spin_unlock(&cifs_tcp_ses_lock);  	free_xid(xid); diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 4dada26d56b5..f2ad0ccd08a7 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1120,24 +1120,63 @@ static bool target_share_equal(struct cifs_tcon *tcon, const char *s1)  	return match;  } -static bool is_ses_good(struct cifs_ses *ses) +static bool is_ses_good(struct cifs_tcon *tcon, struct cifs_ses *ses)  {  	struct TCP_Server_Info *server = ses->server; -	struct cifs_tcon *tcon = ses->tcon_ipc; +	struct cifs_tcon *ipc = NULL;  	bool ret; +	spin_lock(&cifs_tcp_ses_lock);  	spin_lock(&ses->ses_lock);  	spin_lock(&ses->chan_lock); +  	ret = !cifs_chan_needs_reconnect(ses, server) && -		ses->ses_status == SES_GOOD && -		!tcon->need_reconnect; +		ses->ses_status == SES_GOOD; +  	spin_unlock(&ses->chan_lock); + +	if (!ret) +		goto out; + +	if (likely(ses->tcon_ipc)) { +		if (ses->tcon_ipc->need_reconnect) { +			ret = false; +			goto out; +		} +	} else { +		spin_unlock(&ses->ses_lock); +		spin_unlock(&cifs_tcp_ses_lock); + +		ipc = cifs_setup_ipc(ses, tcon->seal); + +		spin_lock(&cifs_tcp_ses_lock); +		spin_lock(&ses->ses_lock); +		if (!IS_ERR(ipc)) { +			if (!ses->tcon_ipc) { +				ses->tcon_ipc = ipc; +				ipc = NULL; +			} +		} else { +			ret = false; +			ipc = NULL; +		} +	} + +out:  	spin_unlock(&ses->ses_lock); +	spin_unlock(&cifs_tcp_ses_lock); +	if (ipc && server->ops->tree_disconnect) { +		unsigned int xid = get_xid(); + +		(void)server->ops->tree_disconnect(xid, ipc); +		_free_xid(xid); +	} +	tconInfoFree(ipc, netfs_trace_tcon_ref_free_ipc);  	return ret;  }  /* Refresh dfs referral of @ses */ -static void refresh_ses_referral(struct cifs_ses *ses) +static void refresh_ses_referral(struct cifs_tcon *tcon, struct cifs_ses *ses)  {  	struct cache_entry *ce;  	unsigned int xid; @@ -1153,7 +1192,7 @@ static void refresh_ses_referral(struct cifs_ses *ses)  	}  	ses = CIFS_DFS_ROOT_SES(ses); -	if (!is_ses_good(ses)) { +	if (!is_ses_good(tcon, ses)) {  		cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n",  			 __func__);  		goto out; @@ -1241,7 +1280,7 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh)  	up_read(&htable_rw_lock);  	ses = CIFS_DFS_ROOT_SES(ses); -	if (!is_ses_good(ses)) { +	if (!is_ses_good(tcon, ses)) {  		cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n",  			 __func__);  		goto out; @@ -1309,7 +1348,7 @@ void dfs_cache_refresh(struct work_struct *work)  	tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work);  	list_for_each_entry(ses, &tcon->dfs_ses_list, dlist) -		refresh_ses_referral(ses); +		refresh_ses_referral(tcon, ses);  	refresh_tcon_referral(tcon, false);  	queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 239dd84a336f..cac355364e43 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2431,8 +2431,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,  	tcon = tlink_tcon(tlink);  	server = tcon->ses->server; -	if (!server->ops->rename) -		return -ENOSYS; +	if (!server->ops->rename) { +		rc = -ENOSYS; +		goto do_rename_exit; +	}  	/* try path-based rename first */  	rc = server->ops->rename(xid, tcon, from_dentry, @@ -2482,11 +2484,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,  	}  #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */  do_rename_exit: -	if (rc == 0) { +	if (rc == 0)  		
d_move(from_dentry, to_dentry); -		/* Force a new lookup */ -		d_drop(from_dentry); -	}  	cifs_put_tlink(tlink);  	return rc;  } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index fe80e711cd75..70f3c0c67eeb 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -5,6 +5,7 @@   *   Author(s): Steve French (sfrench@us.ibm.com)   *   */ +#include <crypto/md5.h>  #include <linux/fs.h>  #include <linux/stat.h>  #include <linux/slab.h> @@ -37,23 +38,6 @@  #define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) md5_hash  static int -symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) -{ -	int rc; -	struct shash_desc *md5 = NULL; - -	rc = cifs_alloc_hash("md5", &md5); -	if (rc) -		return rc; - -	rc = crypto_shash_digest(md5, link_str, link_len, md5_hash); -	if (rc) -		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); -	cifs_free_hash(&md5); -	return rc; -} - -static int  parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,  		 char **_link_str)  { @@ -77,11 +61,7 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,  	if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)  		return -EINVAL; -	rc = symlink_hash(link_len, link_str, md5_hash); -	if (rc) { -		cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); -		return rc; -	} +	md5(link_str, link_len, md5_hash);  	scnprintf(md5_str2, sizeof(md5_str2),  		  CIFS_MF_SYMLINK_MD5_FORMAT, @@ -103,7 +83,6 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,  static int  format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)  { -	int rc;  	unsigned int link_len;  	unsigned int ofs;  	u8 md5_hash[16]; @@ -116,11 +95,7 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)  	if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)  		return -ENAMETOOLONG; -	rc = symlink_hash(link_len, link_str, md5_hash); -	if (rc) { -		cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); -		return rc; -	} +	md5(link_str, link_len, md5_hash);  	scnprintf(buf, buf_len,  		  CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index dda6dece802a..e10123d8cd7d 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -916,6 +916,14 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,  	char *data_end;  	struct dfs_referral_level_3 *ref; +	if (rsp_size < sizeof(*rsp)) { +		cifs_dbg(VFS | ONCE, +			 "%s: header is malformed (size is %u, must be at least %zu)\n", +			 __func__, rsp_size, sizeof(*rsp)); +		rc = -EINVAL; +		goto parse_DFS_referrals_exit; +	} +  	*num_of_nodes = le16_to_cpu(rsp->NumberOfReferrals);  	if (*num_of_nodes < 1) { @@ -925,6 +933,15 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size,  		goto parse_DFS_referrals_exit;  	} +	if (sizeof(*rsp) + *num_of_nodes * sizeof(REFERRAL3) > rsp_size) { +		cifs_dbg(VFS | ONCE, +			 "%s: malformed buffer (size is %u, must be at least %zu)\n", +			 __func__, rsp_size, +			 sizeof(*rsp) + *num_of_nodes * sizeof(REFERRAL3)); +		rc = -EINVAL; +		goto parse_DFS_referrals_exit; +	} +  	ref = (struct dfs_referral_level_3 *) &(rsp->referrals);  	if (ref->VersionNumber != cpu_to_le16(3)) {  		cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n", diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 0a8c2fcc9ded..ef3b498b0a02 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -584,7 +584,7 @@ cifs_ses_add_channel(struct cifs_ses *ses,  	 * to sign packets 
before we generate the channel signing key  	 * (we sign with the session key)  	 */ -	rc = smb311_crypto_shash_allocate(chan->server); +	rc = smb3_crypto_shash_allocate(chan->server);  	if (rc) {  		cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);  		mutex_unlock(&ses->session_mutex); diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 89d933b4a8bc..96bfe4c63ccf 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -7,6 +7,7 @@   *              Pavel Shilovsky (pshilovsky@samba.org) 2012   *   */ +#include <crypto/sha2.h>  #include <linux/ctype.h>  #include "cifsglob.h"  #include "cifsproto.h" @@ -888,13 +889,13 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve   * @iov:	array containing the SMB request we will send to the server   * @nvec:	number of array entries for the iov   */ -int +void  smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,  			   struct kvec *iov, int nvec)  { -	int i, rc; +	int i;  	struct smb2_hdr *hdr; -	struct shash_desc *sha512 = NULL; +	struct sha512_ctx sha_ctx;  	hdr = (struct smb2_hdr *)iov[0].iov_base;  	/* neg prot are always taken */ @@ -907,52 +908,22 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,  	 * and we can test it. Preauth requires 3.1.1 for now.  	 */  	if (server->dialect != SMB311_PROT_ID) -		return 0; +		return;  	if (hdr->Command != SMB2_SESSION_SETUP) -		return 0; +		return;  	/* skip last sess setup response */  	if ((hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)  	    && (hdr->Status == NT_STATUS_OK  		|| (hdr->Status !=  		    cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)))) -		return 0; +		return;  ok: -	rc = smb311_crypto_shash_allocate(server); -	if (rc) -		return rc; - -	sha512 = server->secmech.sha512; -	rc = crypto_shash_init(sha512); -	if (rc) { -		cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); -		return rc; -	} - -	rc = crypto_shash_update(sha512, ses->preauth_sha_hash, -				 SMB2_PREAUTH_HASH_SIZE); -	if (rc) { -		cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); -		return rc; -	} - -	for (i = 0; i < nvec; i++) { -		rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len); -		if (rc) { -			cifs_dbg(VFS, "%s: Could not update sha512 shash\n", -				 __func__); -			return rc; -		} -	} - -	rc = crypto_shash_final(sha512, ses->preauth_sha_hash); -	if (rc) { -		cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", -			 __func__); -		return rc; -	} - -	return 0; +	sha512_init(&sha_ctx); +	sha512_update(&sha_ctx, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); +	for (i = 0; i < nvec; i++) +		sha512_update(&sha_ctx, iov[i].iov_base, iov[i].iov_len); +	sha512_final(&sha_ctx, ses->preauth_sha_hash);  } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 7c392cf5940b..1e39f2165e42 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2799,11 +2799,12 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,  	struct cifs_fid fid;  	int rc;  	__le16 *utf16_path; -	struct cached_fid *cfid = NULL; +	struct cached_fid *cfid;  	int retries = 0, cur_sleep = 1;  replay_again:  	/* reinitialize for possible replay */ +	cfid = NULL;  	flags = CIFS_CP_CREATE_CLOSE_OP;  	oplock = SMB2_OPLOCK_LEVEL_NONE;  	server = cifs_pick_channel(ses); @@ -3212,8 +3213,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,  	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);  	if (!utf16_path) {  		rc = -ENOMEM; -		free_xid(xid); -		
return ERR_PTR(rc); +		goto put_tlink;  	}  	oparms = (struct cifs_open_parms) { @@ -3245,6 +3245,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,  		SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);  	} +put_tlink:  	cifs_put_tlink(tlink);  	free_xid(xid); @@ -3285,8 +3286,7 @@ set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen,  	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);  	if (!utf16_path) {  		rc = -ENOMEM; -		free_xid(xid); -		return rc; +		goto put_tlink;  	}  	oparms = (struct cifs_open_parms) { @@ -3307,6 +3307,7 @@ set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen,  		SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);  	} +put_tlink:  	cifs_put_tlink(tlink);  	free_xid(xid);  	return rc; @@ -5446,7 +5447,6 @@ struct smb_version_operations smb20_operations = {  	.get_lease_key = smb2_get_lease_key,  	.set_lease_key = smb2_set_lease_key,  	.new_lease_key = smb2_new_lease_key, -	.calc_signature = smb2_calc_signature,  	.is_read_op = smb2_is_read_op,  	.set_oplock_level = smb2_set_oplock_level,  	.create_lease_buf = smb2_create_lease_buf, @@ -5550,7 +5550,6 @@ struct smb_version_operations smb21_operations = {  	.get_lease_key = smb2_get_lease_key,  	.set_lease_key = smb2_set_lease_key,  	.new_lease_key = smb2_new_lease_key, -	.calc_signature = smb2_calc_signature,  	.is_read_op = smb21_is_read_op,  	.set_oplock_level = smb21_set_oplock_level,  	.create_lease_buf = smb2_create_lease_buf, @@ -5660,7 +5659,6 @@ struct smb_version_operations smb30_operations = {  	.set_lease_key = smb2_set_lease_key,  	.new_lease_key = smb2_new_lease_key,  	.generate_signingkey = generate_smb30signingkey, -	.calc_signature = smb3_calc_signature,  	.set_integrity  = smb3_set_integrity,  	.is_read_op = smb21_is_read_op,  	.set_oplock_level = smb3_set_oplock_level, @@ -5777,7 +5775,6 @@ struct smb_version_operations smb311_operations = {  	.set_lease_key = smb2_set_lease_key,  	.new_lease_key = smb2_new_lease_key,  	.generate_signingkey = generate_smb311signingkey, -	.calc_signature = smb3_calc_signature,  	.set_integrity  = smb3_set_integrity,  	.is_read_op = smb21_is_read_op,  	.set_oplock_level = smb3_set_oplock_level, diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index b3f1398c9f79..5241daaae543 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -39,12 +39,6 @@ extern struct mid_q_entry *smb2_setup_async_request(  			struct TCP_Server_Info *server, struct smb_rqst *rqst);  extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server,  						__u64 ses_id, __u32  tid); -extern int smb2_calc_signature(struct smb_rqst *rqst, -				struct TCP_Server_Info *server, -				bool allocate_crypto); -extern int smb3_calc_signature(struct smb_rqst *rqst, -				struct TCP_Server_Info *server, -				bool allocate_crypto);  extern void smb2_echo_request(struct work_struct *work);  extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);  extern bool smb2_is_valid_oplock_break(char *buffer, @@ -295,10 +289,10 @@ extern int smb2_validate_and_copy_iov(unsigned int offset,  extern void smb2_copy_fs_info_to_kstatfs(  	 struct smb2_fs_full_size_info *pfs_inf,  	 struct kstatfs *kst); -extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server); -extern int smb311_update_preauth_hash(struct cifs_ses *ses, -				      struct TCP_Server_Info *server, -				      struct kvec *iov, int nvec); +extern int smb3_crypto_shash_allocate(struct TCP_Server_Info *server); +extern void smb311_update_preauth_hash(struct cifs_ses 
*ses, +				       struct TCP_Server_Info *server, +				       struct kvec *iov, int nvec);  extern int smb2_query_info_compound(const unsigned int xid,  				    struct cifs_tcon *tcon,  				    const char *path, u32 desired_access, diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 33f33013b392..6a9b80385b86 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -19,6 +19,7 @@  #include <linux/mempool.h>  #include <linux/highmem.h>  #include <crypto/aead.h> +#include <crypto/sha2.h>  #include "cifsglob.h"  #include "cifsproto.h"  #include "smb2proto.h" @@ -26,53 +27,14 @@  #include "../common/smb2status.h"  #include "smb2glob.h" -static int +int  smb3_crypto_shash_allocate(struct TCP_Server_Info *server)  {  	struct cifs_secmech *p = &server->secmech; -	int rc; -	rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); -	if (rc) -		goto err; - -	rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); -	if (rc) -		goto err; - -	return 0; -err: -	cifs_free_hash(&p->hmacsha256); -	return rc; +	return cifs_alloc_hash("cmac(aes)", &p->aes_cmac);  } -int -smb311_crypto_shash_allocate(struct TCP_Server_Info *server) -{ -	struct cifs_secmech *p = &server->secmech; -	int rc = 0; - -	rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); -	if (rc) -		return rc; - -	rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); -	if (rc) -		goto err; - -	rc = cifs_alloc_hash("sha512", &p->sha512); -	if (rc) -		goto err; - -	return 0; - -err: -	cifs_free_hash(&p->aes_cmac); -	cifs_free_hash(&p->hmacsha256); -	return rc; -} - -  static  int smb3_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)  { @@ -247,16 +209,15 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32  tid)  	return tcon;  } -int +static int  smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, -			bool allocate_crypto) +		    bool allocate_crypto)  {  	int rc;  	unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; -	unsigned char *sigptr = smb2_signature;  	struct kvec *iov = rqst->rq_iov;  	struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; -	struct shash_desc *shash = NULL; +	struct hmac_sha256_ctx hmac_ctx;  	struct smb_rqst drqst;  	__u64 sid = le64_to_cpu(shdr->SessionId);  	u8 key[SMB2_NTLMV2_SESSKEY_SIZE]; @@ -271,30 +232,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,  	memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);  	memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); -	if (allocate_crypto) { -		rc = cifs_alloc_hash("hmac(sha256)", &shash); -		if (rc) { -			cifs_server_dbg(VFS, -					"%s: sha256 alloc failed\n", __func__); -			goto out; -		} -	} else { -		shash = server->secmech.hmacsha256; -	} - -	rc = crypto_shash_setkey(shash->tfm, key, sizeof(key)); -	if (rc) { -		cifs_server_dbg(VFS, -				"%s: Could not update with response\n", -				__func__); -		goto out; -	} - -	rc = crypto_shash_init(shash); -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not init sha256", __func__); -		goto out; -	} +	hmac_sha256_init_usingrawkey(&hmac_ctx, key, sizeof(key));  	/*  	 * For SMB2+, __cifs_calc_signature() expects to sign only the actual @@ -305,25 +243,17 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,  	 */  	drqst = *rqst;  	if (drqst.rq_nvec >= 2 && iov[0].iov_len == 4) { -		rc = crypto_shash_update(shash, iov[0].iov_base, -					 iov[0].iov_len); -		if (rc) { -			cifs_server_dbg(VFS, -					"%s: Could not update with payload\n", -					__func__); -			goto out; -		} +		
hmac_sha256_update(&hmac_ctx, iov[0].iov_base, iov[0].iov_len);  		drqst.rq_iov++;  		drqst.rq_nvec--;  	} -	rc = __cifs_calc_signature(&drqst, server, sigptr, shash); +	rc = __cifs_calc_signature( +		&drqst, server, smb2_signature, +		&(struct cifs_calc_sig_ctx){ .hmac = &hmac_ctx });  	if (!rc) -		memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); +		memcpy(shdr->Signature, smb2_signature, SMB2_SIGNATURE_SIZE); -out: -	if (allocate_crypto) -		cifs_free_hash(&shash);  	return rc;  } @@ -336,8 +266,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,  	__u8 L256[4] = {0, 0, 1, 0};  	int rc = 0;  	unsigned char prfhash[SMB2_HMACSHA256_SIZE]; -	unsigned char *hashptr = prfhash;  	struct TCP_Server_Info *server = ses->server; +	struct hmac_sha256_ctx hmac_ctx;  	memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);  	memset(key, 0x0, key_size); @@ -345,67 +275,26 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,  	rc = smb3_crypto_shash_allocate(server);  	if (rc) {  		cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__); -		goto smb3signkey_ret; -	} - -	rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm, -		ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__); -		goto smb3signkey_ret; -	} - -	rc = crypto_shash_init(server->secmech.hmacsha256); -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__); -		goto smb3signkey_ret; -	} - -	rc = crypto_shash_update(server->secmech.hmacsha256, i, 4); -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__); -		goto smb3signkey_ret; -	} - -	rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len); -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__); -		goto smb3signkey_ret; +		return rc;  	} -	rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1); -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__); -		goto smb3signkey_ret; -	} - -	rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len); -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__); -		goto smb3signkey_ret; -	} +	hmac_sha256_init_usingrawkey(&hmac_ctx, ses->auth_key.response, +				     SMB2_NTLMV2_SESSKEY_SIZE); +	hmac_sha256_update(&hmac_ctx, i, 4); +	hmac_sha256_update(&hmac_ctx, label.iov_base, label.iov_len); +	hmac_sha256_update(&hmac_ctx, &zero, 1); +	hmac_sha256_update(&hmac_ctx, context.iov_base, context.iov_len);  	if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||  		(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { -		rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4); +		hmac_sha256_update(&hmac_ctx, L256, 4);  	} else { -		rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4); -	} -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); -		goto smb3signkey_ret; +		hmac_sha256_update(&hmac_ctx, L128, 4);  	} +	hmac_sha256_final(&hmac_ctx, prfhash); -	rc = crypto_shash_final(server->secmech.hmacsha256, hashptr); -	if (rc) { -		cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); -		goto smb3signkey_ret; -	} - -	memcpy(key, hashptr, key_size); - -smb3signkey_ret: -	return rc; +	memcpy(key, prfhash, key_size); +	return 0;  }  struct derivation { @@ -576,19 +465,21 @@ generate_smb311signingkey(struct cifs_ses *ses,  	return generate_smb3signingkey(ses, server, &triplet);  } -int +static 
int  smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, -			bool allocate_crypto) +		    bool allocate_crypto)  {  	int rc;  	unsigned char smb3_signature[SMB2_CMACAES_SIZE]; -	unsigned char *sigptr = smb3_signature;  	struct kvec *iov = rqst->rq_iov;  	struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;  	struct shash_desc *shash = NULL;  	struct smb_rqst drqst;  	u8 key[SMB3_SIGN_KEY_SIZE]; +	if (server->vals->protocol_id <= SMB21_PROT_ID) +		return smb2_calc_signature(rqst, server, allocate_crypto); +  	rc = smb3_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);  	if (unlikely(rc)) {  		cifs_server_dbg(FYI, "%s: Could not get signing key\n", __func__); @@ -643,9 +534,11 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,  		drqst.rq_nvec--;  	} -	rc = __cifs_calc_signature(&drqst, server, sigptr, shash); +	rc = __cifs_calc_signature( +		&drqst, server, smb3_signature, +		&(struct cifs_calc_sig_ctx){ .shash = shash });  	if (!rc) -		memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); +		memcpy(shdr->Signature, smb3_signature, SMB2_SIGNATURE_SIZE);  out:  	if (allocate_crypto) @@ -657,7 +550,6 @@ out:  static int  smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)  { -	int rc = 0;  	struct smb2_hdr *shdr;  	struct smb2_sess_setup_req *ssr;  	bool is_binding; @@ -684,9 +576,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)  		return 0;  	} -	rc = server->ops->calc_signature(rqst, server, false); - -	return rc; +	return smb3_calc_signature(rqst, server, false);  }  int @@ -722,7 +612,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)  	memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE); -	rc = server->ops->calc_signature(rqst, server, true); +	rc = smb3_calc_signature(rqst, server, true);  	if (rc)  		return rc; diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 316f398c70f4..85a4c55b61b8 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -172,6 +172,7 @@ static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)  	 * in order to notice the broken connection.  	 
*/  	wake_up_all(&sc->status_wait); +	wake_up_all(&sc->send_io.lcredits.wait_queue);  	wake_up_all(&sc->send_io.credits.wait_queue);  	wake_up_all(&sc->send_io.pending.dec_wait_queue);  	wake_up_all(&sc->send_io.pending.zero_wait_queue); @@ -495,6 +496,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)  	struct smbdirect_send_io *request =  		container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);  	struct smbdirect_socket *sc = request->socket; +	int lcredits = 0;  	log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",  		request, ib_wc_status_msg(wc->status)); @@ -504,22 +506,24 @@  			request->sge[i].addr,  			request->sge[i].length,  			DMA_TO_DEVICE); +	mempool_free(request, sc->send_io.mem.pool); +	lcredits += 1;  	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {  		if (wc->status != IB_WC_WR_FLUSH_ERR)  			log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",  				ib_wc_status_msg(wc->status), wc->opcode); -		mempool_free(request, sc->send_io.mem.pool);  		smbd_disconnect_rdma_connection(sc);  		return;  	} +	atomic_add(lcredits, &sc->send_io.lcredits.count); +	wake_up(&sc->send_io.lcredits.wait_queue); +  	if (atomic_dec_and_test(&sc->send_io.pending.count))  		wake_up(&sc->send_io.pending.zero_wait_queue);  	wake_up(&sc->send_io.pending.dec_wait_queue); - -	mempool_free(request, sc->send_io.mem.pool);  }  static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp) @@ -567,6 +571,7 @@ static bool process_negotiation_response(  		log_rdma_event(ERR, "error: credits_granted==0\n");  		return false;  	} +	atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);  	atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));  	if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) { @@ -1114,6 +1119,24 @@ static int smbd_post_send_iter(struct smbdirect_socket *sc,  	struct smbdirect_data_transfer *packet;  	int new_credits = 0; +wait_lcredit: +	/* Wait for local send credits */ +	rc = wait_event_interruptible(sc->send_io.lcredits.wait_queue, +		atomic_read(&sc->send_io.lcredits.count) > 0 || +		sc->status != SMBDIRECT_SOCKET_CONNECTED); +	if (rc) +		goto err_wait_lcredit; + +	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { +		log_outgoing(ERR, "disconnected not sending on wait_lcredit\n"); +		rc = -EAGAIN; +		goto err_wait_lcredit; +	} +	if (unlikely(atomic_dec_return(&sc->send_io.lcredits.count) < 0)) { +		atomic_inc(&sc->send_io.lcredits.count); +		goto wait_lcredit; +	} +  wait_credit:  	/* Wait for send credits.
A SMBD packet needs one credit */  	rc = wait_event_interruptible(sc->send_io.credits.wait_queue, @@ -1132,23 +1155,6 @@ wait_credit:  		goto wait_credit;  	} -wait_send_queue: -	wait_event(sc->send_io.pending.dec_wait_queue, -		atomic_read(&sc->send_io.pending.count) < sp->send_credit_target || -		sc->status != SMBDIRECT_SOCKET_CONNECTED); - -	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { -		log_outgoing(ERR, "disconnected not sending on wait_send_queue\n"); -		rc = -EAGAIN; -		goto err_wait_send_queue; -	} - -	if (unlikely(atomic_inc_return(&sc->send_io.pending.count) > -				sp->send_credit_target)) { -		atomic_dec(&sc->send_io.pending.count); -		goto wait_send_queue; -	} -  	request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);  	if (!request) {  		rc = -ENOMEM; @@ -1229,10 +1235,21 @@ wait_send_queue:  		     le32_to_cpu(packet->data_length),  		     le32_to_cpu(packet->remaining_data_length)); +	/* +	 * Now that we have a local and a remote credit, +	 * add ourselves as pending. +	 */ +	atomic_inc(&sc->send_io.pending.count); +  	rc = smbd_post_send(sc, request);  	if (!rc)  		return 0; +	if (atomic_dec_and_test(&sc->send_io.pending.count)) +		wake_up(&sc->send_io.pending.zero_wait_queue); + +	wake_up(&sc->send_io.pending.dec_wait_queue); +  err_dma:  	for (i = 0; i < request->num_sge; i++)  		if (request->sge[i].addr) @@ -1246,14 +1263,14 @@ err_dma:  	atomic_sub(new_credits, &sc->recv_io.credits.count);  err_alloc: -	if (atomic_dec_and_test(&sc->send_io.pending.count)) -		wake_up(&sc->send_io.pending.zero_wait_queue); - -err_wait_send_queue: -	/* roll back send credits and pending */  	atomic_inc(&sc->send_io.credits.count); +	wake_up(&sc->send_io.credits.wait_queue);  err_wait_credit: +	atomic_inc(&sc->send_io.lcredits.count); +	wake_up(&sc->send_io.lcredits.wait_queue); + +err_wait_lcredit:  	return rc;  } @@ -1575,12 +1592,12 @@ void smbd_destroy(struct TCP_Server_Info *server)  	disable_work_sync(&sc->disconnect_work);  	log_rdma_event(INFO, "destroying rdma session\n"); -	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { +	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)  		smbd_disconnect_rdma_work(&sc->disconnect_work); +	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {  		log_rdma_event(INFO, "wait for transport being disconnected\n"); -		wait_event_interruptible( -			sc->status_wait, -			sc->status == SMBDIRECT_SOCKET_DISCONNECTED); +		wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); +		log_rdma_event(INFO, "waited for transport being disconnected\n");  	}  	/* @@ -1624,19 +1641,7 @@ void smbd_destroy(struct TCP_Server_Info *server)  	log_rdma_event(INFO, "free receive buffers\n");  	destroy_receive_buffers(sc); -	/* -	 * For performance reasons, memory registration and deregistration -	 * are not locked by srv_mutex. It is possible some processes are -	 * blocked on transport srv_mutex while holding memory registration. -	 * Release the transport srv_mutex to allow them to hit the failure -	 * path when sending data, and then release memory registrations. 
-	 */  log_rdma_event(INFO, "freeing mr list\n"); -	while (atomic_read(&sc->mr_io.used.count)) { -		cifs_server_unlock(server); -		msleep(1000); -		cifs_server_lock(server); -	}  	destroy_mr_list(sc);  	ib_free_cq(sc->ib.send_cq); @@ -1779,6 +1784,7 @@ static struct smbd_connection *_smbd_get_connection(  	struct smbdirect_socket *sc;  	struct smbdirect_socket_parameters *sp;  	struct rdma_conn_param conn_param; +	struct ib_qp_cap qp_cap;  	struct ib_qp_init_attr qp_attr;  	struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;  	struct ib_port_immutable port_immutable; @@ -1850,6 +1856,25 @@ static struct smbd_connection *_smbd_get_connection(  		goto config_failed;  	} +	sp->responder_resources = +		min_t(u8, sp->responder_resources, +		      sc->ib.dev->attrs.max_qp_rd_atom); +	log_rdma_mr(INFO, "responder_resources=%d\n", +		sp->responder_resources); + +	/* +	 * We allocate sp->responder_resources * 2 MRs, +	 * and each MR needs WRs for REG and INV, so +	 * we use '* 4'. +	 * +	 * +1 for ib_drain_qp() +	 */ +	memset(&qp_cap, 0, sizeof(qp_cap)); +	qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1; +	qp_cap.max_recv_wr = sp->recv_credit_max + 1; +	qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; +	qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; +  	sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);  	if (IS_ERR(sc->ib.pd)) {  		rc = PTR_ERR(sc->ib.pd); @@ -1860,7 +1885,7 @@ static struct smbd_connection *_smbd_get_connection(  	sc->ib.send_cq =  		ib_alloc_cq_any(sc->ib.dev, sc, -				sp->send_credit_target, IB_POLL_SOFTIRQ); +				qp_cap.max_send_wr, IB_POLL_SOFTIRQ);  	if (IS_ERR(sc->ib.send_cq)) {  		sc->ib.send_cq = NULL;  		goto alloc_cq_failed;  	}  	sc->ib.recv_cq =  		ib_alloc_cq_any(sc->ib.dev, sc, -				sp->recv_credit_max, IB_POLL_SOFTIRQ); +				qp_cap.max_recv_wr, IB_POLL_SOFTIRQ);  	if (IS_ERR(sc->ib.recv_cq)) {  		sc->ib.recv_cq = NULL;  		goto alloc_cq_failed;  	} @@ -1877,11 +1902,7 @@  	memset(&qp_attr, 0, sizeof(qp_attr));  	qp_attr.event_handler = smbd_qp_async_error_upcall;  	qp_attr.qp_context = sc; -	qp_attr.cap.max_send_wr = sp->send_credit_target; -	qp_attr.cap.max_recv_wr = sp->recv_credit_max; -	qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; -	qp_attr.cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; -	qp_attr.cap.max_inline_data = 0; +	qp_attr.cap = qp_cap;  	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;  	qp_attr.qp_type = IB_QPT_RC;  	qp_attr.send_cq = sc->ib.send_cq; @@ -1895,12 +1916,6 @@  	}  	sc->ib.qp = sc->rdma.cm_id->qp; -	sp->responder_resources = -		min_t(u8, sp->responder_resources, -		      sc->ib.dev->attrs.max_qp_rd_atom); -	log_rdma_mr(INFO, "responder_resources=%d\n", -		sp->responder_resources); -  	memset(&conn_param, 0, sizeof(conn_param));  	conn_param.initiator_depth = sp->initiator_depth;  	conn_param.responder_resources = sp->responder_resources; @@ -2352,18 +2367,84 @@ static void smbd_mr_recovery_work(struct work_struct *work)  	}  } +static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr) +{ +	struct smbdirect_socket *sc = mr->socket; + +	lockdep_assert_held(&mr->mutex); + +	if (mr->state == SMBDIRECT_MR_DISABLED) +		return; + +	if (mr->mr) +		ib_dereg_mr(mr->mr); +	if (mr->sgt.nents) +		ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); +	kfree(mr->sgt.sgl); + +	mr->mr = NULL; +	mr->sgt.sgl = NULL; +	mr->sgt.nents = 0;
+ +	mr->state = SMBDIRECT_MR_DISABLED; +} + +static void smbd_mr_free_locked(struct kref *kref) +{ +	struct smbdirect_mr_io *mr = +		container_of(kref, struct smbdirect_mr_io, kref); + +	lockdep_assert_held(&mr->mutex); + +	/* +	 * smbd_mr_disable_locked() should already be called! +	 */ +	if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED)) +		smbd_mr_disable_locked(mr); + +	mutex_unlock(&mr->mutex); +	mutex_destroy(&mr->mutex); +	kfree(mr); +} +  static void destroy_mr_list(struct smbdirect_socket *sc)  {  	struct smbdirect_mr_io *mr, *tmp; +	LIST_HEAD(all_list); +	unsigned long flags;  	disable_work_sync(&sc->mr_io.recovery_work); -	list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) { -		if (mr->state == SMBDIRECT_MR_INVALIDATED) -			ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, -				mr->sgt.nents, mr->dir); -		ib_dereg_mr(mr->mr); -		kfree(mr->sgt.sgl); -		kfree(mr); + +	spin_lock_irqsave(&sc->mr_io.all.lock, flags); +	list_splice_tail_init(&sc->mr_io.all.list, &all_list); +	spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); + +	list_for_each_entry_safe(mr, tmp, &all_list, list) { +		mutex_lock(&mr->mutex); + +		smbd_mr_disable_locked(mr); +		list_del(&mr->list); +		mr->socket = NULL; + +		/* +		 * No kref_put_mutex() as it's already locked. +		 * +		 * If smbd_mr_free_locked() is called +		 * and the mutex is unlocked and mr is gone, +		 * in that case kref_put() returned 1. +		 * +		 * If kref_put() returned 0 we know that +		 * smbd_mr_free_locked() didn't +		 * run. Not by us nor by anyone else, as we +		 * still hold the mutex, so we need to unlock. +		 * +		 * If the mr is still registered it will +		 * be dangling (detached from the connection), +		 * waiting for smbd_deregister_mr() to be +		 * called in order to free the memory. +		 */ +		if (!kref_put(&mr->kref, smbd_mr_free_locked)) +			mutex_unlock(&mr->mutex);  	}  } @@ -2377,10 +2458,9 @@ static void destroy_mr_list(struct smbdirect_socket *sc)  static int allocate_mr_list(struct smbdirect_socket *sc)  {  	struct smbdirect_socket_parameters *sp = &sc->parameters; -	int i; -	struct smbdirect_mr_io *smbdirect_mr, *tmp; - -	INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); +	struct smbdirect_mr_io *mr; +	int ret; +	u32 i;  	if (sp->responder_resources == 0) {  		log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); @@ -2389,42 +2469,52 @@  	/* Allocate more MRs (2x) than hardware responder_resources */  	for (i = 0; i < sp->responder_resources * 2; i++) { -		smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); -		if (!smbdirect_mr) -			goto cleanup_entries; -		smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type, -					sp->max_frmr_depth); -		if (IS_ERR(smbdirect_mr->mr)) { +		mr = kzalloc(sizeof(*mr), GFP_KERNEL); +		if (!mr) { +			ret = -ENOMEM; +			goto kzalloc_mr_failed; +		} + +		kref_init(&mr->kref); +		mutex_init(&mr->mutex); + +		mr->mr = ib_alloc_mr(sc->ib.pd, +				     sc->mr_io.type, +				     sp->max_frmr_depth); +		if (IS_ERR(mr->mr)) { +			ret = PTR_ERR(mr->mr);  			log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",  				    sc->mr_io.type, sp->max_frmr_depth); -			goto out; +			goto ib_alloc_mr_failed;  		} -		smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth, -						sizeof(struct scatterlist), -						GFP_KERNEL); -		if (!smbdirect_mr->sgt.sgl) { + +		mr->sgt.sgl = kcalloc(sp->max_frmr_depth, +				      sizeof(struct scatterlist), +				      GFP_KERNEL); +		if (!mr->sgt.sgl) { +			ret = -ENOMEM;
log_rdma_mr(ERR, "failed to allocate sgl\n"); -			ib_dereg_mr(smbdirect_mr->mr); -			goto out; +			goto kcalloc_sgl_failed;  		} -		smbdirect_mr->state = SMBDIRECT_MR_READY; -		smbdirect_mr->socket = sc; +		mr->state = SMBDIRECT_MR_READY; +		mr->socket = sc; -		list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list); +		list_add_tail(&mr->list, &sc->mr_io.all.list);  		atomic_inc(&sc->mr_io.ready.count);  	} + +	INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); +  	return 0; -out: -	kfree(smbdirect_mr); -cleanup_entries: -	list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) { -		list_del(&smbdirect_mr->list); -		ib_dereg_mr(smbdirect_mr->mr); -		kfree(smbdirect_mr->sgt.sgl); -		kfree(smbdirect_mr); -	} -	return -ENOMEM; +kcalloc_sgl_failed: +	ib_dereg_mr(mr->mr); +ib_alloc_mr_failed: +	mutex_destroy(&mr->mutex); +	kfree(mr); +kzalloc_mr_failed: +	destroy_mr_list(sc); +	return ret;  }  /* @@ -2458,6 +2548,7 @@ again:  	list_for_each_entry(ret, &sc->mr_io.all.list, list) {  		if (ret->state == SMBDIRECT_MR_READY) {  			ret->state = SMBDIRECT_MR_REGISTERED; +			kref_get(&ret->kref);  			spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);  			atomic_dec(&sc->mr_io.ready.count);  			atomic_inc(&sc->mr_io.used.count); @@ -2504,9 +2595,8 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,  {  	struct smbdirect_socket *sc = &info->socket;  	struct smbdirect_socket_parameters *sp = &sc->parameters; -	struct smbdirect_mr_io *smbdirect_mr; +	struct smbdirect_mr_io *mr;  	int rc, num_pages; -	enum dma_data_direction dir;  	struct ib_reg_wr *reg_wr;  	num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); @@ -2517,49 +2607,47 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,  		return NULL;  	} -	smbdirect_mr = get_mr(sc); -	if (!smbdirect_mr) { +	mr = get_mr(sc); +	if (!mr) {  		log_rdma_mr(ERR, "get_mr returning NULL\n");  		return NULL;  	} -	dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; -	smbdirect_mr->dir = dir; -	smbdirect_mr->need_invalidate = need_invalidate; -	smbdirect_mr->sgt.nents = 0; -	smbdirect_mr->sgt.orig_nents = 0; +	mutex_lock(&mr->mutex); + +	mr->dir = writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; +	mr->need_invalidate = need_invalidate; +	mr->sgt.nents = 0; +	mr->sgt.orig_nents = 0;  	log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",  		    num_pages, iov_iter_count(iter), sp->max_frmr_depth); -	smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth); +	smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth); -	rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, -			   smbdirect_mr->sgt.nents, dir); +	rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);  	if (!rc) {  		log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", -			num_pages, dir, rc); +			    num_pages, mr->dir, rc);  		goto dma_map_error;  	} -	rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl, -			  smbdirect_mr->sgt.nents, NULL, PAGE_SIZE); -	if (rc != smbdirect_mr->sgt.nents) { +	rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE); +	if (rc != mr->sgt.nents) {  		log_rdma_mr(ERR, -			"ib_map_mr_sg failed rc = %d nents = %x\n", -			rc, smbdirect_mr->sgt.nents); +			    "ib_map_mr_sg failed rc = %d nents = %x\n", +			    rc, mr->sgt.nents);  		goto map_mr_error;  	} -	ib_update_fast_reg_key(smbdirect_mr->mr, -		ib_inc_rkey(smbdirect_mr->mr->rkey)); -	reg_wr = &smbdirect_mr->wr; +	ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey)); +	reg_wr = &mr->wr;  	reg_wr->wr.opcode = IB_WR_REG_MR; -	smbdirect_mr->cqe.done = register_mr_done; -	reg_wr->wr.wr_cqe = &smbdirect_mr->cqe; +	mr->cqe.done = register_mr_done; +	reg_wr->wr.wr_cqe = &mr->cqe;  	reg_wr->wr.num_sge = 0;  	reg_wr->wr.send_flags = IB_SEND_SIGNALED; -	reg_wr->mr = smbdirect_mr->mr; -	reg_wr->key = smbdirect_mr->mr->rkey; +	reg_wr->mr = mr->mr; +	reg_wr->key = mr->mr->rkey;  	reg_wr->access = writing ?  			IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :  			IB_ACCESS_REMOTE_READ; @@ -2570,24 +2658,51 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,  	 * on the next ib_post_send when we actually send I/O to remote peer  	 */  	rc = ib_post_send(sc->ib.qp, ®_wr->wr, NULL); -	if (!rc) -		return smbdirect_mr; +	if (!rc) { +		/* +		 * get_mr() gave us a reference +		 * via kref_get(&mr->kref), we keep that and let +		 * the caller use smbd_deregister_mr() +		 * to remove it again. +		 */ +		mutex_unlock(&mr->mutex); +		return mr; +	}  	log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",  		rc, reg_wr->key);  	/* If all failed, attempt to recover this MR by setting it SMBDIRECT_MR_ERROR*/  map_mr_error: -	ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, -			smbdirect_mr->sgt.nents, smbdirect_mr->dir); +	ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);  dma_map_error: -	smbdirect_mr->state = SMBDIRECT_MR_ERROR; +	mr->sgt.nents = 0; +	mr->state = SMBDIRECT_MR_ERROR;  	if (atomic_dec_and_test(&sc->mr_io.used.count))  		wake_up(&sc->mr_io.cleanup.wait_queue);  	smbd_disconnect_rdma_connection(sc); +	/* +	 * get_mr() gave us a reference +	 * via kref_get(&mr->kref), we need to remove it again +	 * on error. +	 * +	 * No kref_put_mutex() as it's already locked. +	 * +	 * If smbd_mr_free_locked() is called +	 * and the mutex is unlocked and mr is gone, +	 * in that case kref_put() returned 1. +	 * +	 * If kref_put() returned 0 we know that +	 * smbd_mr_free_locked() didn't +	 * run. Not by us nor by anyone else, as we +	 * still hold the mutex, so we need to unlock. 
+	 */ +	if (!kref_put(&mr->kref, smbd_mr_free_locked)) +		mutex_unlock(&mr->mutex); +  	return NULL;  } @@ -2612,44 +2727,55 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)   * and we have to locally invalidate the buffer to prevent data from being   * modified by the remote peer after the upper layer consumes it   */ -int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) +void smbd_deregister_mr(struct smbdirect_mr_io *mr)  { -	struct ib_send_wr *wr; -	struct smbdirect_socket *sc = smbdirect_mr->socket; -	int rc = 0; +	struct smbdirect_socket *sc = mr->socket; + +	mutex_lock(&mr->mutex); +	if (mr->state == SMBDIRECT_MR_DISABLED) +		goto put_kref; + +	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { +		smbd_mr_disable_locked(mr); +		goto put_kref; +	} + +	if (mr->need_invalidate) { +		struct ib_send_wr *wr = &mr->inv_wr; +		int rc; -	if (smbdirect_mr->need_invalidate) {  		/* Need to finish local invalidation before returning */ -		wr = &smbdirect_mr->inv_wr;  		wr->opcode = IB_WR_LOCAL_INV; -		smbdirect_mr->cqe.done = local_inv_done; -		wr->wr_cqe = &smbdirect_mr->cqe; +		mr->cqe.done = local_inv_done; +		wr->wr_cqe = &mr->cqe;  		wr->num_sge = 0; -		wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey; +		wr->ex.invalidate_rkey = mr->mr->rkey;  		wr->send_flags = IB_SEND_SIGNALED; -		init_completion(&smbdirect_mr->invalidate_done); +		init_completion(&mr->invalidate_done);  		rc = ib_post_send(sc->ib.qp, wr, NULL);  		if (rc) {  			log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); +			smbd_mr_disable_locked(mr);  			smbd_disconnect_rdma_connection(sc);  			goto done;  		} -		wait_for_completion(&smbdirect_mr->invalidate_done); -		smbdirect_mr->need_invalidate = false; +		wait_for_completion(&mr->invalidate_done); +		mr->need_invalidate = false;  	} else  		/*  		 * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED  		 * and defer to mr_recovery_work to recover the MR for next use  		 */ -		smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; -	if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) { -		ib_dma_unmap_sg( -			sc->ib.dev, smbdirect_mr->sgt.sgl, -			smbdirect_mr->sgt.nents, -			smbdirect_mr->dir); -		smbdirect_mr->state = SMBDIRECT_MR_READY; +		mr->state = SMBDIRECT_MR_INVALIDATED; +	if (mr->sgt.nents) { +		ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); +		mr->sgt.nents = 0; +	} + +	if (mr->state == SMBDIRECT_MR_INVALIDATED) { +		mr->state = SMBDIRECT_MR_READY;  		if (atomic_inc_return(&sc->mr_io.ready.count) == 1)  			wake_up(&sc->mr_io.ready.wait_queue);  	} else @@ -2663,7 +2789,23 @@ done:  	if (atomic_dec_and_test(&sc->mr_io.used.count))  		wake_up(&sc->mr_io.cleanup.wait_queue); -	return rc; +put_kref: +	/* +	 * No kref_put_mutex() as it's already locked. +	 * +	 * If smbd_mr_free_locked() is called +	 * and the mutex is unlocked and mr is gone, +	 * in that case kref_put() returned 1. +	 * +	 * If kref_put() returned 0 we know that +	 * smbd_mr_free_locked() didn't +	 * run. Not by us nor by anyone else, as we +	 * still hold the mutex, so we need to unlock +	 * and keep the mr in SMBDIRECT_MR_READY or +	 * SMBDIRECT_MR_ERROR state. 
+	 */ +	if (!kref_put(&mr->kref, smbd_mr_free_locked)) +		mutex_unlock(&mr->mutex);  }  static bool smb_set_sge(struct smb_extract_to_rdma *rdma, diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index d67ac5ddaff4..577d37dbeb8a 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -60,7 +60,7 @@ int smbd_send(struct TCP_Server_Info *server,  struct smbdirect_mr_io *smbd_register_mr(  	struct smbd_connection *info, struct iov_iter *iter,  	bool writing, bool need_invalidate); -int smbd_deregister_mr(struct smbdirect_mr_io *mr); +void smbd_deregister_mr(struct smbdirect_mr_io *mr);  #else  #define cifs_rdma_enabled(server)	0 diff --git a/fs/smb/client/trace.c b/fs/smb/client/trace.c index 465483787193..16b0e719731f 100644 --- a/fs/smb/client/trace.c +++ b/fs/smb/client/trace.c @@ -4,5 +4,6 @@   *   *   Author(s): Steve French <stfrench@microsoft.com>   */ +#include "cifsglob.h"  #define CREATE_TRACE_POINTS  #include "trace.h" diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index b88fa04f5792..029910d56c22 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -178,7 +178,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler,  			memcpy(pacl, value, size);  			if (pTcon->ses->server->ops->set_acl) {  				int aclflags = 0; -				rc = 0;  				switch (handler->flags) {  				case XATTR_CIFS_NTSD_FULL: diff --git a/fs/smb/common/cifsglob.h b/fs/smb/common/cifsglob.h new file mode 100644 index 000000000000..00fd215e3eb5 --- /dev/null +++ b/fs/smb/common/cifsglob.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ +/* + * + *   Copyright (C) International Business Machines  Corp., 2002,2008 + *   Author(s): Steve French (sfrench@us.ibm.com) + *              Jeremy Allison (jra@samba.org) + * + */ +#ifndef _COMMON_CIFS_GLOB_H +#define _COMMON_CIFS_GLOB_H + +static inline void inc_rfc1001_len(void *buf, int count) +{ +	be32_add_cpu((__be32 *)buf, count); +} + +#define SMB1_VERSION_STRING	"1.0" +#define SMB20_VERSION_STRING    "2.0" +#define SMB21_VERSION_STRING	"2.1" +#define SMBDEFAULT_VERSION_STRING "default" +#define SMB3ANY_VERSION_STRING "3" +#define SMB30_VERSION_STRING	"3.0" +#define SMB302_VERSION_STRING	"3.02" +#define ALT_SMB302_VERSION_STRING "3.0.2" +#define SMB311_VERSION_STRING	"3.1.1" +#define ALT_SMB311_VERSION_STRING "3.11" + +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +#endif	/* _COMMON_CIFS_GLOB_H */ diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index db22a1d0546b..ee5a90d691c8 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -142,7 +142,15 @@ struct smbdirect_socket {  		} mem;  		/* -		 * The credit state for the send side +		 * The local credit state for ib_post_send() +		 */ +		struct { +			atomic_t count; +			wait_queue_head_t wait_queue; +		} lcredits; + +		/* +		 * The remote credit state for the send side  		 */  		struct {  			atomic_t count; @@ -337,6 +345,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)  	INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);  	disable_delayed_work_sync(&sc->idle.timer_work); +	atomic_set(&sc->send_io.lcredits.count, 0); +	init_waitqueue_head(&sc->send_io.lcredits.wait_queue); +  	atomic_set(&sc->send_io.credits.count, 0);  	init_waitqueue_head(&sc->send_io.credits.wait_queue); @@ -437,13 +448,22 @@ enum smbdirect_mr_state {  	SMBDIRECT_MR_READY,  	SMBDIRECT_MR_REGISTERED,  	
SMBDIRECT_MR_INVALIDATED, -	SMBDIRECT_MR_ERROR +	SMBDIRECT_MR_ERROR, +	SMBDIRECT_MR_DISABLED  };  struct smbdirect_mr_io {  	struct smbdirect_socket *socket;  	struct ib_cqe cqe; +	/* +	 * We can have up to two references: +	 * 1. by the connection +	 * 2. by the registration +	 */ +	struct kref kref; +	struct mutex mutex; +  	struct list_head list;  	enum smbdirect_mr_state state; diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 6fa025374f2f..1c181ef99929 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -147,14 +147,11 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id)  int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id)  {  	struct ksmbd_session_rpc *entry; -	int method; -	down_read(&sess->rpc_lock); +	lockdep_assert_held(&sess->rpc_lock);  	entry = xa_load(&sess->rpc_handle_list, id); -	method = entry ? entry->method : 0; -	up_read(&sess->rpc_lock); -	return method; +	return entry ? entry->method : 0;  }  void ksmbd_session_destroy(struct ksmbd_session *sess) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index ab1d45fcebde..f901ae18e68a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1806,6 +1806,7 @@ int smb2_sess_setup(struct ksmbd_work *work)  		if (ksmbd_conn_need_reconnect(conn)) {  			rc = -EFAULT; +			ksmbd_user_session_put(sess);  			sess = NULL;  			goto out_err;  		} @@ -4625,8 +4626,15 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess,  	 * pipe without opening it, checking error condition here  	 */  	id = req->VolatileFileId; -	if (!ksmbd_session_rpc_method(sess, id)) + +	lockdep_assert_not_held(&sess->rpc_lock); + +	down_read(&sess->rpc_lock); +	if (!ksmbd_session_rpc_method(sess, id)) { +		up_read(&sess->rpc_lock);  		return -ENOENT; +	} +	up_read(&sess->rpc_lock);  	ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n",  		    req->FileInfoClass, req->VolatileFileId); @@ -6824,6 +6832,7 @@ int smb2_read(struct ksmbd_work *work)  	nbytes = ksmbd_vfs_read(work, fp, length, &offset, aux_payload_buf);  	if (nbytes < 0) { +		kvfree(aux_payload_buf);  		err = nbytes;  		goto out;  	} diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index d742ba754348..863716207a0d 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -10,6 +10,7 @@  #include "glob.h"  #include "nterr.h" +#include "../common/cifsglob.h"  #include "../common/smb2pdu.h"  #include "smb2pdu.h" @@ -26,16 +27,8 @@  #define SMB311_PROT		6  #define BAD_PROT		0xFFFF -#define SMB1_VERSION_STRING	"1.0" -#define SMB20_VERSION_STRING	"2.0" -#define SMB21_VERSION_STRING	"2.1" -#define SMB30_VERSION_STRING	"3.0" -#define SMB302_VERSION_STRING	"3.02" -#define SMB311_VERSION_STRING	"3.1.1" -  #define SMB_ECHO_INTERVAL	(60 * HZ) -#define CIFS_DEFAULT_IOSIZE	(64 * 1024)  #define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */  #define MAX_STREAM_PROT_LEN	0x00FFFFFF @@ -464,9 +457,4 @@ static inline unsigned int get_rfc1002_len(void *buf)  {  	return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;  } - -static inline void inc_rfc1001_len(void *buf, int count) -{ -	be32_add_cpu((__be32 *)buf, count); -}  #endif /* __SMB_COMMON_H__ */ diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 2aa1b29bea08..2c08cccfa680 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -263,10 +263,16 @@ static void ipc_msg_handle_free(int handle)  static int handle_response(int type, void 
*payload, size_t sz)  { -	unsigned int handle = *(unsigned int *)payload; +	unsigned int handle;  	struct ipc_msg_table_entry *entry;  	int ret = 0; +	/* Prevent 4-byte read beyond declared payload size */ +	if (sz < sizeof(unsigned int)) +		return -EINVAL; + +	handle = *(unsigned int *)payload; +  	ipc_update_last_active();  	down_read(&ipc_msg_table_lock);  	hash_for_each_possible(ipc_msg_table, entry, ipc_table_hlist, handle) { @@ -825,6 +831,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle  	if (!msg)  		return NULL; +	lockdep_assert_not_held(&sess->rpc_lock); + +	down_read(&sess->rpc_lock);  	msg->type = KSMBD_EVENT_RPC_REQUEST;  	req = (struct ksmbd_rpc_command *)msg->payload;  	req->handle = handle; @@ -833,6 +842,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle  	req->flags |= KSMBD_RPC_WRITE_METHOD;  	req->payload_sz = payload_sz;  	memcpy(req->payload, payload, payload_sz); +	up_read(&sess->rpc_lock);  	resp = ipc_msg_send_request(msg, req->handle);  	ipc_msg_free(msg); @@ -849,6 +859,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle)  	if (!msg)  		return NULL; +	lockdep_assert_not_held(&sess->rpc_lock); + +	down_read(&sess->rpc_lock);  	msg->type = KSMBD_EVENT_RPC_REQUEST;  	req = (struct ksmbd_rpc_command *)msg->payload;  	req->handle = handle; @@ -856,6 +869,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle)  	req->flags |= rpc_context_flags(sess);  	req->flags |= KSMBD_RPC_READ_METHOD;  	req->payload_sz = 0; +	up_read(&sess->rpc_lock);  	resp = ipc_msg_send_request(msg, req->handle);  	ipc_msg_free(msg); @@ -876,6 +890,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle  	if (!msg)  		return NULL; +	lockdep_assert_not_held(&sess->rpc_lock); + +	down_read(&sess->rpc_lock);  	msg->type = KSMBD_EVENT_RPC_REQUEST;  	req = (struct ksmbd_rpc_command *)msg->payload;  	req->handle = handle; @@ -884,6 +901,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle  	req->flags |= KSMBD_RPC_IOCTL_METHOD;  	req->payload_sz = payload_sz;  	memcpy(req->payload, payload, payload_sz); +	up_read(&sess->rpc_lock);  	resp = ipc_msg_send_request(msg, req->handle);  	ipc_msg_free(msg); diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index b3077766d6ec..7d86553fcc7c 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -219,6 +219,7 @@ static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc)  	 * in order to notice the broken connection.  	 
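The comment above is the crux of the disconnect design: every blocking path sleeps on a condition that also tests the socket status, so a single broadcast from the disconnect path unblocks everyone with an error. A sketch of what such a waiter looks like, with demo_ names standing in for the real smbdirect helpers (status handling simplified):

#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/wait.h>

enum { DEMO_CONNECTED = 1, DEMO_DISCONNECTED = 2 };

struct demo_sock {
	atomic_t credits;
	wait_queue_head_t credit_wq;
	int status;
};

static int demo_wait_for_credit(struct demo_sock *sc)
{
	int rc;

	do {
		if (atomic_dec_return(&sc->credits) >= 0)
			return 0;
		atomic_inc(&sc->credits);

		/*
		 * The condition must include the status so that
		 * wake_up_all() from the disconnect path wakes us
		 * into the -ENOTCONN branch instead of leaving us
		 * asleep on a dead connection.
		 */
		rc = wait_event_interruptible(sc->credit_wq,
				atomic_read(&sc->credits) > 0 ||
				sc->status != DEMO_CONNECTED);
		if (rc)
			return rc;
		if (sc->status != DEMO_CONNECTED)
			return -ENOTCONN;
	} while (true);
}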
*/  	wake_up_all(&sc->status_wait); +	wake_up_all(&sc->send_io.lcredits.wait_queue);  	wake_up_all(&sc->send_io.credits.wait_queue);  	wake_up_all(&sc->send_io.pending.zero_wait_queue);  	wake_up_all(&sc->recv_io.reassembly.wait_queue); @@ -417,9 +418,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)  	sc->ib.dev = sc->rdma.cm_id->device; -	INIT_WORK(&sc->recv_io.posted.refill_work, -		  smb_direct_post_recv_credits); -	INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);  	INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer);  	conn = ksmbd_conn_alloc(); @@ -450,11 +448,10 @@ static void free_transport(struct smb_direct_transport *t)  	struct smbdirect_recv_io *recvmsg;  	disable_work_sync(&sc->disconnect_work); -	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { +	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)  		smb_direct_disconnect_rdma_work(&sc->disconnect_work); -		wait_event_interruptible(sc->status_wait, -					 sc->status == SMBDIRECT_SOCKET_DISCONNECTED); -	} +	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) +		wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);  	/*  	 * Wake up all waiters in all wait queues @@ -469,9 +466,11 @@ static void free_transport(struct smb_direct_transport *t)  	disable_delayed_work_sync(&sc->idle.timer_work);  	disable_work_sync(&sc->idle.immediate_work); +	if (sc->rdma.cm_id) +		rdma_lock_handler(sc->rdma.cm_id); +  	if (sc->ib.qp) {  		ib_drain_qp(sc->ib.qp); -		ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs);  		sc->ib.qp = NULL;  		rdma_destroy_qp(sc->rdma.cm_id);  	} @@ -498,8 +497,10 @@ static void free_transport(struct smb_direct_transport *t)  		ib_free_cq(sc->ib.recv_cq);  	if (sc->ib.pd)  		ib_dealloc_pd(sc->ib.pd); -	if (sc->rdma.cm_id) +	if (sc->rdma.cm_id) { +		rdma_unlock_handler(sc->rdma.cm_id);  		rdma_destroy_id(sc->rdma.cm_id); +	}  	smb_direct_destroy_pools(sc);  	ksmbd_conn_free(KSMBD_TRANS(t)->conn); @@ -524,6 +525,12 @@ static void smb_direct_free_sendmsg(struct smbdirect_socket *sc,  {  	int i; +	/* +	 * The list needs to be empty! +	 * The caller should take care of it. +	 */ +	WARN_ON_ONCE(!list_empty(&msg->sibling_list)); +  	if (msg->num_sge > 0) {  		ib_dma_unmap_single(sc->ib.dev,  				    msg->sge[0].addr, msg->sge[0].length, @@ -909,9 +916,9 @@ static void smb_direct_post_recv_credits(struct work_struct *work)  static void send_done(struct ib_cq *cq, struct ib_wc *wc)  { -	struct smbdirect_send_io *sendmsg, *sibling; +	struct smbdirect_send_io *sendmsg, *sibling, *next;  	struct smbdirect_socket *sc; -	struct list_head *pos, *prev, *end; +	int lcredits = 0;  	sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);  	sc = sendmsg->socket; @@ -920,27 +927,31 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)  		    ib_wc_status_msg(wc->status), wc->status,  		    wc->opcode); +	/* +	 * Free possible siblings and then the main send_io +	 */ +	list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) { +		list_del_init(&sibling->sibling_list); +		smb_direct_free_sendmsg(sc, sibling); +		lcredits += 1; +	} +	/* Note this frees wc->wr_cqe, but not wc */ +	smb_direct_free_sendmsg(sc, sendmsg); +	lcredits += 1; +  	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {  		pr_err("Send error. 
status='%s (%d)', opcode=%d\n",  		       ib_wc_status_msg(wc->status), wc->status,  		       wc->opcode);  		smb_direct_disconnect_rdma_connection(sc); +		return;  	} +	atomic_add(lcredits, &sc->send_io.lcredits.count); +	wake_up(&sc->send_io.lcredits.wait_queue); +  	if (atomic_dec_and_test(&sc->send_io.pending.count))  		wake_up(&sc->send_io.pending.zero_wait_queue); - -	/* iterate and free the list of messages in reverse. the list's head -	 * is invalid. -	 */ -	for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next; -	     prev != end; pos = prev, prev = prev->prev) { -		sibling = container_of(pos, struct smbdirect_send_io, sibling_list); -		smb_direct_free_sendmsg(sc, sibling); -	} - -	sibling = container_of(pos, struct smbdirect_send_io, sibling_list); -	smb_direct_free_sendmsg(sc, sibling);  }  static int manage_credits_prior_sending(struct smbdirect_socket *sc) @@ -988,8 +999,6 @@ static int smb_direct_post_send(struct smbdirect_socket *sc,  	ret = ib_post_send(sc->ib.qp, wr, NULL);  	if (ret) {  		pr_err("failed to post send: %d\n", ret); -		if (atomic_dec_and_test(&sc->send_io.pending.count)) -			wake_up(&sc->send_io.pending.zero_wait_queue);  		smb_direct_disconnect_rdma_connection(sc);  	}  	return ret; @@ -1032,19 +1041,29 @@ static int smb_direct_flush_send_list(struct smbdirect_socket *sc,  	last->wr.send_flags = IB_SEND_SIGNALED;  	last->wr.wr_cqe = &last->cqe; +	/* +	 * Remove last from send_ctx->msg_list +	 * and splice the rest of send_ctx->msg_list +	 * to last->sibling_list. +	 * +	 * send_ctx->msg_list is a valid empty list +	 * at the end. +	 */ +	list_del_init(&last->sibling_list); +	list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list); +	send_ctx->wr_cnt = 0; +  	ret = smb_direct_post_send(sc, &first->wr); -	if (!ret) { -		smb_direct_send_ctx_init(send_ctx, -					 send_ctx->need_invalidate_rkey, -					 send_ctx->remote_key); -	} else { -		atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count); -		wake_up(&sc->send_io.credits.wait_queue); -		list_for_each_entry_safe(first, last, &send_ctx->msg_list, -					 sibling_list) { -			smb_direct_free_sendmsg(sc, first); +	if (ret) { +		struct smbdirect_send_io *sibling, *next; + +		list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) { +			list_del_init(&sibling->sibling_list); +			smb_direct_free_sendmsg(sc, sibling);  		} +		smb_direct_free_sendmsg(sc, last);  	} +  	return ret;  } @@ -1070,6 +1089,23 @@ static int wait_for_credits(struct smbdirect_socket *sc,  	} while (true);  } +static int wait_for_send_lcredit(struct smbdirect_socket *sc, +				 struct smbdirect_send_batch *send_ctx) +{ +	if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) { +		int ret; + +		ret = smb_direct_flush_send_list(sc, send_ctx, false); +		if (ret) +			return ret; +	} + +	return wait_for_credits(sc, +				&sc->send_io.lcredits.wait_queue, +				&sc->send_io.lcredits.count, +				1); +} +  static int wait_for_send_credits(struct smbdirect_socket *sc,  				 struct smbdirect_send_batch *send_ctx)  { @@ -1257,9 +1293,13 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,  	int data_length;  	struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; +	ret = wait_for_send_lcredit(sc, send_ctx); +	if (ret) +		goto lcredit_failed; +  	ret = wait_for_send_credits(sc, send_ctx);  	if (ret) -		return ret; +		goto credit_failed;  	data_length = 0;  	for (i = 0; i < niov; i++) @@ -1267,10 +1307,8 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,  	
ret = smb_direct_create_header(sc, data_length, remaining_data_length,  				       &msg); -	if (ret) { -		atomic_inc(&sc->send_io.credits.count); -		return ret; -	} +	if (ret) +		goto header_failed;  	for (i = 0; i < niov; i++) {  		struct ib_sge *sge; @@ -1308,7 +1346,11 @@ static int smb_direct_post_send_data(struct smbdirect_socket *sc,  	return 0;  err:  	smb_direct_free_sendmsg(sc, msg); +header_failed:  	atomic_inc(&sc->send_io.credits.count); +credit_failed: +	atomic_inc(&sc->send_io.lcredits.count); +lcredit_failed:  	return ret;  } @@ -1574,18 +1616,14 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,  					     get_buf_page_count(desc_buf, desc_buf_len),  					     msg->sg_list, SG_CHUNK_SIZE);  		if (ret) { -			kfree(msg);  			ret = -ENOMEM; -			goto out; +			goto free_msg;  		}  		ret = get_sg_list(desc_buf, desc_buf_len,  				  msg->sgt.sgl, msg->sgt.orig_nents); -		if (ret < 0) { -			sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); -			kfree(msg); -			goto out; -		} +		if (ret < 0) +			goto free_table;  		ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,  				       msg->sgt.sgl, @@ -1596,9 +1634,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,  				       is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);  		if (ret < 0) {  			pr_err("failed to init rdma_rw_ctx: %d\n", ret); -			sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); -			kfree(msg); -			goto out; +			goto free_table;  		}  		list_add_tail(&msg->list, &msg_list); @@ -1630,6 +1666,12 @@ out:  	atomic_add(credits_needed, &sc->rw_io.credits.count);  	wake_up(&sc->rw_io.credits.wait_queue);  	return ret; + +free_table: +	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); +free_msg: +	kfree(msg); +	goto out;  }  static int smb_direct_rdma_write(struct ksmbd_transport *t, @@ -1687,10 +1729,10 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,  	}  	case RDMA_CM_EVENT_DEVICE_REMOVAL:  	case RDMA_CM_EVENT_DISCONNECTED: { -		ib_drain_qp(sc->ib.qp); -  		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;  		smb_direct_disconnect_rdma_work(&sc->disconnect_work); +		if (sc->ib.qp) +			ib_drain_qp(sc->ib.qp);  		break;  	}  	case RDMA_CM_EVENT_CONNECT_ERROR: { @@ -1864,27 +1906,17 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)  		goto out_err;  	} -	smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);  	return 0;  out_err:  	put_recvmsg(sc, recvmsg);  	return ret;  } -static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc) -{ -	return min_t(unsigned int, -		     sc->ib.dev->attrs.max_fast_reg_page_list_len, -		     256); -} - -static int smb_direct_init_params(struct smbdirect_socket *sc, -				  struct ib_qp_cap *cap) +static int smb_direct_init_params(struct smbdirect_socket *sc)  {  	struct smbdirect_socket_parameters *sp = &sc->parameters; -	struct ib_device *device = sc->ib.dev; -	int max_send_sges, max_rw_wrs, max_send_wrs; -	unsigned int max_sge_per_wr, wrs_per_credit; +	int max_send_sges; +	unsigned int maxpages;  	/* need 3 more sge. because a SMB_DIRECT header, SMB2 header,  	 * SMB2 response could be mapped. @@ -1895,67 +1927,20 @@ static int smb_direct_init_params(struct smbdirect_socket *sc,  		return -EINVAL;  	} -	/* Calculate the number of work requests for RDMA R/W. -	 * The maximum number of pages which can be registered -	 * with one Memory region can be transferred with one -	 * R/W credit. 
And at least 4 work requests for each credit -	 * are needed for MR registration, RDMA R/W, local & remote -	 * MR invalidation. -	 */ -	sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc); -	sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size, -					 (sc->rw_io.credits.num_pages - 1) * -					 PAGE_SIZE); - -	max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, -			       device->attrs.max_sge_rd); -	max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, -			       max_send_sges); -	wrs_per_credit = max_t(unsigned int, 4, -			       DIV_ROUND_UP(sc->rw_io.credits.num_pages, -					    max_sge_per_wr) + 1); -	max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit; - -	max_send_wrs = sp->send_credit_target + max_rw_wrs; -	if (max_send_wrs > device->attrs.max_cqe || -	    max_send_wrs > device->attrs.max_qp_wr) { -		pr_err("consider lowering send_credit_target = %d\n", -		       sp->send_credit_target); -		pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", -		       device->attrs.max_cqe, device->attrs.max_qp_wr); -		return -EINVAL; -	} +	atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); -	if (sp->recv_credit_max > device->attrs.max_cqe || -	    sp->recv_credit_max > device->attrs.max_qp_wr) { -		pr_err("consider lowering receive_credit_max = %d\n", -		       sp->recv_credit_max); -		pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", -		       device->attrs.max_cqe, device->attrs.max_qp_wr); -		return -EINVAL; -	} - -	if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) { -		pr_err("warning: device max_send_sge = %d too small\n", -		       device->attrs.max_send_sge); -		return -EINVAL; -	} -	if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { -		pr_err("warning: device max_recv_sge = %d too small\n", -		       device->attrs.max_recv_sge); -		return -EINVAL; -	} +	maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE); +	sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev, +						  sc->rdma.cm_id->port_num, +						  maxpages); +	sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max); +	/* add one extra in order to handle unaligned pages */ +	sc->rw_io.credits.max += 1;  	sc->recv_io.credits.target = 1;  	atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); -	cap->max_send_wr = max_send_wrs; -	cap->max_recv_wr = sp->recv_credit_max; -	cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; -	cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; -	cap->max_inline_data = 0; -	cap->max_rdma_ctxs = sc->rw_io.credits.max;  	return 0;  } @@ -2029,13 +2014,129 @@ err:  	return -ENOMEM;  } -static int smb_direct_create_qpair(struct smbdirect_socket *sc, -				   struct ib_qp_cap *cap) +static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr) +{ +	/* +	 * This could be split out of rdma_rw_init_qp() +	 * and be a helper function next to rdma_rw_mr_factor() +	 * +	 * We can't check unlikely(rdma_rw_force_mr) here, +	 * but that is most likely 0 anyway. +	 */ +	u32 factor; + +	WARN_ON_ONCE(attr->port_num == 0); + +	/* +	 * Each context needs at least one RDMA READ or WRITE WR. +	 * +	 * For some hardware we might need more, eventually we should ask the +	 * HCA driver for a multiplier here. +	 */ +	factor = 1; + +	/* +	 * If the device needs MRs to perform RDMA READ or WRITE operations, +	 * we'll need two additional MRs for the registrations and the +	 * invalidation. 
+	 */ +	if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd) +		factor += 2;	/* inv + reg */ + +	return factor * attr->cap.max_rdma_ctxs; +} + +static int smb_direct_create_qpair(struct smbdirect_socket *sc)  {  	struct smbdirect_socket_parameters *sp = &sc->parameters;  	int ret; +	struct ib_qp_cap qp_cap;  	struct ib_qp_init_attr qp_attr; -	int pages_per_rw; +	u32 max_send_wr; +	u32 rdma_send_wr; + +	/* +	 * Note that {rdma,ib}_create_qp() will call +	 * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0. +	 * It will adjust cap->max_send_wr to the required +	 * number of additional WRs for the RDMA RW operations. +	 * It will cap cap->max_send_wr to the device limit. +	 * +	 * +1 for ib_drain_qp +	 */ +	qp_cap.max_send_wr = sp->send_credit_target + 1; +	qp_cap.max_recv_wr = sp->recv_credit_max + 1; +	qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; +	qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; +	qp_cap.max_inline_data = 0; +	qp_cap.max_rdma_ctxs = sc->rw_io.credits.max; + +	/* +	 * Find out the number of max_send_wr +	 * after rdma_rw_init_qp() adjusted it. +	 * +	 * We only do it on a temporary variable, +	 * as rdma_create_qp() will trigger +	 * rdma_rw_init_qp() again. +	 */ +	memset(&qp_attr, 0, sizeof(qp_attr)); +	qp_attr.cap = qp_cap; +	qp_attr.port_num = sc->rdma.cm_id->port_num; +	rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr); +	max_send_wr = qp_cap.max_send_wr + rdma_send_wr; + +	if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe || +	    qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) { +		pr_err("Possible CQE overrun: max_send_wr %d\n", +		       qp_cap.max_send_wr); +		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", +		       IB_DEVICE_NAME_MAX, +		       sc->ib.dev->name, +		       sc->ib.dev->attrs.max_cqe, +		       sc->ib.dev->attrs.max_qp_wr); +		pr_err("consider lowering send_credit_target = %d\n", +		       sp->send_credit_target); +		return -EINVAL; +	} + +	if (qp_cap.max_rdma_ctxs && +	    (max_send_wr >= sc->ib.dev->attrs.max_cqe || +	     max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) { +		pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n", +		       rdma_send_wr, qp_cap.max_send_wr, max_send_wr); +		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", +		       IB_DEVICE_NAME_MAX, +		       sc->ib.dev->name, +		       sc->ib.dev->attrs.max_cqe, +		       sc->ib.dev->attrs.max_qp_wr); +		pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n", +		       sp->send_credit_target, qp_cap.max_rdma_ctxs); +		return -EINVAL; +	} + +	if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe || +	    qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) { +		pr_err("Possible CQE overrun: max_recv_wr %d\n", +		       qp_cap.max_recv_wr); +		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", +		       IB_DEVICE_NAME_MAX, +		       sc->ib.dev->name, +		       sc->ib.dev->attrs.max_cqe, +		       sc->ib.dev->attrs.max_qp_wr); +		pr_err("consider lowering receive_credit_max = %d\n", +		       sp->recv_credit_max); +		return -EINVAL; +	} + +	if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge || +	    qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) { +		pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", +		       IB_DEVICE_NAME_MAX, +		       sc->ib.dev->name, +		       sc->ib.dev->attrs.max_send_sge, +		       sc->ib.dev->attrs.max_recv_sge); +		return -EINVAL; +	}  	sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);  	if (IS_ERR(sc->ib.pd)) { @@ -2046,8 +2147,7 @@ static 
int smb_direct_create_qpair(struct smbdirect_socket *sc,  	}  	sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, -					 sp->send_credit_target + -					 cap->max_rdma_ctxs, +					 max_send_wr,  					 IB_POLL_WORKQUEUE);  	if (IS_ERR(sc->ib.send_cq)) {  		pr_err("Can't create RDMA send CQ\n"); @@ -2057,7 +2157,7 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,  	}  	sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, -					 sp->recv_credit_max, +					 qp_cap.max_recv_wr,  					 IB_POLL_WORKQUEUE);  	if (IS_ERR(sc->ib.recv_cq)) {  		pr_err("Can't create RDMA recv CQ\n"); @@ -2066,10 +2166,18 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,  		goto err;  	} +	/* +	 * We reset completely here! +	 * As the above use was just temporary +	 * to calc max_send_wr and rdma_send_wr. +	 * +	 * rdma_create_qp() will trigger rdma_rw_init_qp() +	 * again if max_rdma_ctxs is not 0. +	 */  	memset(&qp_attr, 0, sizeof(qp_attr));  	qp_attr.event_handler = smb_direct_qpair_handler;  	qp_attr.qp_context = sc; -	qp_attr.cap = *cap; +	qp_attr.cap = qp_cap;  	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;  	qp_attr.qp_type = IB_QPT_RC;  	qp_attr.send_cq = sc->ib.send_cq; @@ -2085,18 +2193,6 @@ static int smb_direct_create_qpair(struct smbdirect_socket *sc,  	sc->ib.qp = sc->rdma.cm_id->qp;  	sc->rdma.cm_id->event_handler = smb_direct_cm_handler; -	pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1; -	if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) { -		ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs, -				      sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG, -				      sc->rw_io.credits.num_pages, 0); -		if (ret) { -			pr_err("failed to init mr pool count %zu pages %zu\n", -			       sc->rw_io.credits.max, sc->rw_io.credits.num_pages); -			goto err; -		} -	} -  	return 0;  err:  	if (sc->ib.qp) { @@ -2154,8 +2250,8 @@ static int smb_direct_prepare(struct ksmbd_transport *t)  		return -ECONNABORTED;  	ret = smb_direct_check_recvmsg(recvmsg); -	if (ret == -ECONNABORTED) -		goto out; +	if (ret) +		goto put;  	req = (struct smbdirect_negotiate_req *)recvmsg->packet;  	sp->max_recv_size = min_t(int, sp->max_recv_size, @@ -2170,23 +2266,46 @@ static int smb_direct_prepare(struct ksmbd_transport *t)  	sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);  	sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); -	ret = smb_direct_send_negotiate_response(sc, ret); -out: +put:  	spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);  	sc->recv_io.reassembly.queue_length--;  	list_del(&recvmsg->list);  	spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);  	put_recvmsg(sc, recvmsg); +	if (ret == -ECONNABORTED) +		return ret; + +	if (ret) +		goto respond; + +	/* +	 * We negotiated with success, so we need to refill the recv queue. +	 * We do that with sc->idle.immediate_work still being disabled +	 * via smbdirect_socket_init(), so that queue_work(sc->workqueue, +	 * &sc->idle.immediate_work) in smb_direct_post_recv_credits() +	 * is a no-op. +	 * +	 * The message that grants the credits to the client is +	 * the negotiate response. 
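The ordering spelled out above matters because the negotiate response is itself the credit grant: the peer may transmit the moment it sees nonzero credits, and every send it makes consumes one posted receive. Reduced to a sketch with hypothetical demo_ helpers (prototypes only; the bodies are imagined):

struct demo_conn {
	int recv_credit_target;
};

static int demo_post_recv_buffers(struct demo_conn *sc, int n);
static int demo_send_negotiate_response(struct demo_conn *sc, int credits);

static int demo_finish_negotiate(struct demo_conn *sc)
{
	int rc;

	/* Post receives first: every credit promises a posted receive. */
	rc = demo_post_recv_buffers(sc, sc->recv_credit_target);
	if (rc)
		return rc;

	/* Only now announce the credits to the peer. */
	return demo_send_negotiate_response(sc, sc->recv_credit_target);
}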
+	 */ +	INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits); +	smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); +	if (unlikely(sc->first_error)) +		return sc->first_error; +	INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); + +respond: +	ret = smb_direct_send_negotiate_response(sc, ret); +  	return ret;  }  static int smb_direct_connect(struct smbdirect_socket *sc)  { -	struct ib_qp_cap qp_cap;  	int ret; -	ret = smb_direct_init_params(sc, &qp_cap); +	ret = smb_direct_init_params(sc);  	if (ret) {  		pr_err("Can't configure RDMA parameters\n");  		return ret; @@ -2198,7 +2317,7 @@ static int smb_direct_connect(struct smbdirect_socket *sc)  		return ret;  	} -	ret = smb_direct_create_qpair(sc, &qp_cap); +	ret = smb_direct_create_qpair(sc);  	if (ret) {  		pr_err("Can't accept RDMA client: %d\n", ret);  		return ret; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2d78e94072a0..e142bac4f9f8 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -498,17 +498,26 @@ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,  }  EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); -static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, +static int sysfs_group_attrs_change_owner(struct kobject *kobj, +					  struct kernfs_node *grp_kn,  					  const struct attribute_group *grp,  					  struct iattr *newattrs)  {  	struct kernfs_node *kn; -	int error; +	int error, i; +	umode_t mode;  	if (grp->attrs) {  		struct attribute *const *attr; -		for (attr = grp->attrs; *attr; attr++) { +		for (i = 0, attr = grp->attrs; *attr; i++, attr++) { +			if (grp->is_visible) { +				mode = grp->is_visible(kobj, *attr, i); +				if (mode & SYSFS_GROUP_INVISIBLE) +					break; +				if (!mode) +					continue; +			}  			kn = kernfs_find_and_get(grp_kn, (*attr)->name);  			if (!kn)  				return -ENOENT; @@ -523,7 +532,14 @@ static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,  	if (grp->bin_attrs) {  		const struct bin_attribute *const *bin_attr; -		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { +		for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { +			if (grp->is_bin_visible) { +				mode = grp->is_bin_visible(kobj, *bin_attr, i); +				if (mode & SYSFS_GROUP_INVISIBLE) +					break; +				if (!mode) +					continue; +			}  			kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name);  			if (!kn)  				return -ENOENT; @@ -573,7 +589,7 @@ int sysfs_group_change_owner(struct kobject *kobj,  	error = kernfs_setattr(grp_kn, &newattrs);  	if (!error) -		error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs); +		error = sysfs_group_attrs_change_owner(kobj, grp_kn, grp, &newattrs);  	kernfs_put(grp_kn); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 8930d5254e1d..b99da294e9a3 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -119,6 +119,15 @@ config XFS_RT  	  See the xfs man page in section 5 for additional information. +	  This option is mandatory to support zoned block devices. For these +	  devices, the realtime subvolume must be backed by a zoned block +	  device and a regular block device used as the main device (for +	  metadata). If the zoned block device is a host-managed SMR hard-disk +	  containing conventional zones at the beginning of its address space, +	  XFS will use the disk conventional zones as the main device and the +	  remaining sequential write required zones as the backing storage for +	  the realtime subvolume. +  	  If unsure, say N.  
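Looping back to the sysfs change earlier in this section: sysfs_group_attrs_change_owner() now consults is_visible()/is_bin_visible() with the attribute index, mirroring the visibility decisions made when the group was created, so it no longer returns -ENOENT for attributes that were intentionally never instantiated. A sketch of a typical callback (the device and field names are made up):

#include <linux/kobject.h>
#include <linux/sysfs.h>

struct demo_dev {
	struct kobject kobj;
	bool has_feature;
};

static umode_t demo_attr_is_visible(struct kobject *kobj,
				    struct attribute *attr, int n)
{
	struct demo_dev *dev = container_of(kobj, struct demo_dev, kobj);

	/*
	 * Returning 0 hides this one attribute; a mode ORed with
	 * SYSFS_GROUP_INVISIBLE hides the whole group, which is why
	 * the owner-change loop bails out when it sees that bit.
	 */
	if (!dev->has_feature)
		return 0;
	return attr->mode;
}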
config XFS_DRAIN_INTENTS @@ -156,7 +165,7 @@ config XFS_ONLINE_SCRUB_STATS  	bool "XFS online metadata check usage data collection"  	default y  	depends on XFS_ONLINE_SCRUB -	select DEBUG_FS +	depends on DEBUG_FS  	help  	  If you say Y here, the kernel will gather usage data about  	  the online metadata check subsystem.  This includes the number diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index d36a6ae0abe5..d4fcf591e63d 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -50,6 +50,12 @@ struct xfs_rtgroup {  		uint8_t			*rtg_rsum_cache;  		struct xfs_open_zone	*rtg_open_zone;  	}; + +	/* +	 * Count of outstanding GC operations for zoned XFS.  Any RTG with a +	 * non-zero rtg_gccount will not be picked as new GC victim. +	 */ +	atomic_t		rtg_gccount;  };  /* diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 26721fab5cab..091c79e432e5 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -376,6 +376,36 @@ out_incomplete:  	return error;  } +static uint +xchk_nlinks_ilock_dir( +	struct xfs_inode	*ip) +{ +	uint			lock_mode = XFS_ILOCK_SHARED; + +	/* +	 * We're going to scan the directory entries, so we must be ready to +	 * pull the data fork mappings into memory if they aren't already. +	 */ +	if (xfs_need_iread_extents(&ip->i_df)) +		lock_mode = XFS_ILOCK_EXCL; + +	/* +	 * We're going to scan the parent pointers, so we must be ready to +	 * pull the attr fork mappings into memory if they aren't already. +	 */ +	if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) && +	    xfs_need_iread_extents(&ip->i_af)) +		lock_mode = XFS_ILOCK_EXCL; + +	/* +	 * Take the IOLOCK so that other threads cannot start a directory +	 * update while we're scanning. +	 */ +	lock_mode |= XFS_IOLOCK_SHARED; +	xfs_ilock(ip, lock_mode); +	return lock_mode; +} +  /* Walk a directory to bump the observed link counts of the children. */  STATIC int  xchk_nlinks_collect_dir( @@ -394,8 +424,7 @@ xchk_nlinks_collect_dir(  		return 0;  	/* Prevent anyone from changing this directory while we walk it. */ -	xfs_ilock(dp, XFS_IOLOCK_SHARED); -	lock_mode = xfs_ilock_data_map_shared(dp); +	lock_mode = xchk_nlinks_ilock_dir(dp);  	/*  	 * The dotdot entry of an unlinked directory still points to the last @@ -452,7 +481,6 @@ out_abort:  	xchk_iscan_abort(&xnc->collect_iscan);  out_unlock:  	xfs_iunlock(dp, lock_mode); -	xfs_iunlock(dp, XFS_IOLOCK_SHARED);  	return error;  } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 773d959965dc..47edf3041631 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1751,7 +1751,7 @@ xfs_init_buftarg(  	const char			*descr)  {  	/* The maximum size of the buftarg is only known once the sb is read. 
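The first hunk below replaces the (xfs_daddr_t)-1 placeholder with an explicit XFS_BUF_DADDR_MAX defined as S64_MAX. Since xfs_daddr_t is signed, -1 makes a poor "size not known yet" sentinel for range checks; a sketch of the difference (illustrative, not the actual xfs_buf code):

#include <linux/types.h>
#include <linux/limits.h>

typedef s64 demo_daddr_t;	/* xfs_daddr_t is a signed 64-bit LBA */

static bool demo_daddr_in_range(demo_daddr_t daddr, demo_daddr_t nr_sectors)
{
	/*
	 * With nr_sectors = (demo_daddr_t)-1 this rejects every I/O,
	 * since -1 compares below any valid non-negative daddr.  With
	 * nr_sectors = S64_MAX, everything passes until the superblock
	 * supplies the real device size.
	 */
	return daddr >= 0 && daddr < nr_sectors;
}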
*/ -	btp->bt_nr_sectors = (xfs_daddr_t)-1; +	btp->bt_nr_sectors = XFS_BUF_DADDR_MAX;  	/* Set up device logical sector size mask */  	btp->bt_logical_sectorsize = logical_sectorsize; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8fa7bdf59c91..e25cd2a160f3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache;   */  struct xfs_buf; +#define XFS_BUF_DADDR_MAX	((xfs_daddr_t) S64_MAX)  #define XFS_BUF_DADDR_NULL	((xfs_daddr_t) (-1LL))  #define XBF_READ	 (1u << 0) /* buffer intended for reading from device */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f046d1215b04..b871dfde372b 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -236,7 +236,6 @@ typedef struct xfs_mount {  	bool			m_update_sb;	/* sb needs update in mount */  	unsigned int		m_max_open_zones;  	unsigned int		m_zonegc_low_space; -	struct xfs_mru_cache	*m_zone_cache;  /* Inode to open zone cache */  	/* max_atomic_write mount option value */  	unsigned long long	m_awu_max_bytes; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e85a156dc17d..1067ebb3b001 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -102,7 +102,7 @@ static const struct constant_table dax_param_enums[] = {   * Table driven mount option parser.   */  enum { -	Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, +	Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,  	Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,  	Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,  	Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, @@ -114,7 +114,21 @@ enum {  	Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,  }; +#define fsparam_dead(NAME) \ +	__fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL) +  static const struct fs_parameter_spec xfs_fs_parameters[] = { +	/* +	 * These mount options were supposed to be deprecated in September 2025 +	 * but the deprecation warning was buggy, so not all users were +	 * notified.  The deprecation is now obnoxiously loud and postponed to +	 * September 2030. +	 */ +	fsparam_dead("attr2"), +	fsparam_dead("noattr2"), +	fsparam_dead("ikeep"), +	fsparam_dead("noikeep"), +  	fsparam_u32("logbufs",		Opt_logbufs),  	fsparam_string("logbsize",	Opt_logbsize),  	fsparam_string("logdev",	Opt_logdev), @@ -786,6 +800,12 @@ xfs_fs_evict_inode(  	truncate_inode_pages_final(&inode->i_data);  	clear_inode(inode); + +	if (IS_ENABLED(CONFIG_XFS_RT) && +	    S_ISREG(inode->i_mode) && inode->i_private) { +		xfs_open_zone_put(inode->i_private); +		inode->i_private = NULL; +	}  }  static void @@ -1373,16 +1393,25 @@ suffix_kstrtoull(  static inline void  xfs_fs_warn_deprecated(  	struct fs_context	*fc, -	struct fs_parameter	*param, -	uint64_t		flag, -	bool			value) +	struct fs_parameter	*param)  { -	/* Don't print the warning if reconfiguring and current mount point -	 * already had the flag set +	/* +	 * Always warn about someone passing in a deprecated mount option. +	 * Previously we wouldn't print the warning if we were reconfiguring +	 * and current mount point already had the flag set, but that was not +	 * the right thing to do. +	 * +	 * Many distributions mount the root filesystem with no options in the +	 * initramfs and rely on mount -a to remount the root fs with the +	 * options in fstab.  However, the old behavior meant that there would +	 * never be a warning about deprecated mount options for the root fs in +	 * /etc/fstab.  On a single-fs system, that means no warning at all. 
+	 * +	 * Compounding this problem are distribution scripts that copy +	 * /proc/mounts to fstab, which means that we can't remove mount +	 * options unless we're 100% sure they have only ever been advertised +	 * in /proc/mounts in response to explicitly provided mount options.  	 */ -	if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && -            !!(XFS_M(fc->root->d_sb)->m_features & flag) == value) -		return;  	xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);  } @@ -1408,6 +1437,9 @@ xfs_fs_parse_param(  		return opt;  	switch (opt) { +	case Op_deprecated: +		xfs_fs_warn_deprecated(fc, param); +		return 0;  	case Opt_logbufs:  		parsing_mp->m_logbufs = result.uint_32;  		return 0; @@ -1528,7 +1560,6 @@ xfs_fs_parse_param(  		xfs_mount_set_dax_mode(parsing_mp, result.uint_32);  		return 0;  #endif -	/* Following mount options will be removed in September 2025 */  	case Opt_max_open_zones:  		parsing_mp->m_max_open_zones = result.uint_32;  		return 0; @@ -2221,7 +2252,7 @@ xfs_init_fs_context(  	struct xfs_mount	*mp;  	int			i; -	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); +	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);  	if (!mp)  		return -ENOMEM; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 1147bacb2da8..040402240807 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -26,14 +26,22 @@  #include "xfs_trace.h"  #include "xfs_mru_cache.h" +static void +xfs_open_zone_free_rcu( +	struct callback_head	*cb) +{ +	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu); + +	xfs_rtgroup_rele(oz->oz_rtg); +	kfree(oz); +} +  void  xfs_open_zone_put(  	struct xfs_open_zone	*oz)  { -	if (atomic_dec_and_test(&oz->oz_ref)) { -		xfs_rtgroup_rele(oz->oz_rtg); -		kfree(oz); -	} +	if (atomic_dec_and_test(&oz->oz_ref)) +		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);  }  static inline uint32_t @@ -238,6 +246,14 @@ xfs_zoned_map_extent(  	 * If a data write raced with this GC write, keep the existing data in  	 * the data fork, mark our newly written GC extent as reclaimable, then  	 * move on to the next extent. +	 * +	 * Note that this can also happen when racing with operations that do +	 * not actually invalidate the data, but just move it to a different +	 * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the +	 * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE).  If the +	 * data was just moved around, GC fails to free the zone, but the zone +	 * becomes a GC candidate again as soon as all previous GC I/O has +	 * finished and these blocks will be moved out eventually.  	 */  	if (old_startblock != NULLFSBLOCK &&  	    old_startblock != data.br_startblock) @@ -614,14 +630,25 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)  }  /* - * Try to pack inodes that are written back after they were closed tight instead - * of trying to open new zones for them or spread them to the least recently - * used zone.  This optimizes the data layout for workloads that untar or copy - * a lot of small files.  Right now this does not separate multiple such + * Try to tightly pack small files that are written back after they were closed + * instead of trying to open new zones for them or spread them to the least + * recently used zone. This optimizes the data layout for workloads that untar + * or copy a lot of small files. Right now this does not separate multiple such   * streams.   
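The rewritten helper just below enforces this with a plain size check against the zone capacity. As a worked example under an assumed geometry:

/*
 * Worked example, assuming a 256 MiB zone capacity:
 *
 *   4 KiB file written after close  -> packed tight, shares a zone
 *                                      with other small closed files
 *   300 MiB file                    -> i_size >= zone capacity, never
 *                                      packed, gets zones of its own
 */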
*/  static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)  { +	struct xfs_mount *mp = ip->i_mount; +	size_t zone_capacity = +		XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks); + +	/* +	 * Do not pack writes for files that are already using a full zone, +	 * to avoid fragmentation. +	 */ +	if (i_size_read(VFS_I(ip)) >= zone_capacity) +		return false; +  	return !inode_is_open_for_write(VFS_I(ip)) &&  		!(ip->i_diflags & XFS_DIFLAG_APPEND);  } @@ -746,97 +773,54 @@ xfs_mark_rtg_boundary(  }  /* - * Cache the last zone written to for an inode so that it is considered first - * for subsequent writes. - */ -struct xfs_zone_cache_item { -	struct xfs_mru_cache_elem	mru; -	struct xfs_open_zone		*oz; -}; - -static inline struct xfs_zone_cache_item * -xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) -{ -	return container_of(mru, struct xfs_zone_cache_item, mru); -} - -static void -xfs_zone_cache_free_func( -	void				*data, -	struct xfs_mru_cache_elem	*mru) -{ -	struct xfs_zone_cache_item	*item = xfs_zone_cache_item(mru); - -	xfs_open_zone_put(item->oz); -	kfree(item); -} - -/*   * Check if we have a cached last open zone available for the inode and   * if yes return a reference to it.   */  static struct xfs_open_zone * -xfs_cached_zone( -	struct xfs_mount		*mp, -	struct xfs_inode		*ip) +xfs_get_cached_zone( +	struct xfs_inode	*ip)  { -	struct xfs_mru_cache_elem	*mru; -	struct xfs_open_zone		*oz; +	struct xfs_open_zone	*oz; -	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); -	if (!mru) -		return NULL; -	oz = xfs_zone_cache_item(mru)->oz; +	rcu_read_lock(); +	oz = VFS_I(ip)->i_private;  	if (oz) {  		/*  		 * GC only steals open zones at mount time, so no GC zones  		 * should end up in the cache.  		 */  		ASSERT(!oz->oz_is_gc); -		ASSERT(atomic_read(&oz->oz_ref) > 0); -		atomic_inc(&oz->oz_ref); +		if (!atomic_inc_not_zero(&oz->oz_ref)) +			oz = NULL;  	} -	xfs_mru_cache_done(mp->m_zone_cache); +	rcu_read_unlock(); +  	return oz;  }  /* - * Update the last used zone cache for a given inode. + * Stash our zone in the inode so that it is reused for future allocations.   * - * The caller must have a reference on the open zone. + * The open_zone structure will be pinned until either the inode is freed or + * until the cached open zone is replaced with a different one because the + * current one was full when we tried to use it.  This means we keep any + * open zone around for as long as any inode that used it for the last + * write is cached, which slightly increases the memory use of cached inodes + * that were ever written to, but significantly simplifies the cached zone + * lookup.  Because the open_zone is clearly marked as full when all data + * in the underlying RTG was written, the caching is always safe.   */  static void -xfs_zone_cache_create_association( -	struct xfs_inode		*ip, -	struct xfs_open_zone		*oz) +xfs_set_cached_zone( +	struct xfs_inode	*ip, +	struct xfs_open_zone	*oz)  { -	struct xfs_mount		*mp = ip->i_mount; -	struct xfs_zone_cache_item	*item = NULL; -	struct xfs_mru_cache_elem	*mru; +	struct xfs_open_zone	*old_oz; -	ASSERT(atomic_read(&oz->oz_ref) > 0);  	atomic_inc(&oz->oz_ref); - -	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); -	if (mru) { -		/* -		 * If we have an association already, update it to point to the -		 * new zone. 
-		 */ -		item = xfs_zone_cache_item(mru); -		xfs_open_zone_put(item->oz); -		item->oz = oz; -		xfs_mru_cache_done(mp->m_zone_cache); -		return; -	} - -	item = kmalloc(sizeof(*item), GFP_KERNEL); -	if (!item) { -		xfs_open_zone_put(oz); -		return; -	} -	item->oz = oz; -	xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); +	old_oz = xchg(&VFS_I(ip)->i_private, oz); +	if (old_oz) +		xfs_open_zone_put(old_oz);  }  static void @@ -880,15 +864,14 @@ xfs_zone_alloc_and_submit(  	 * the inode is still associated with a zone and use that if so.  	 */  	if (!*oz) -		*oz = xfs_cached_zone(mp, ip); +		*oz = xfs_get_cached_zone(ip);  	if (!*oz) {  select_zone:  		*oz = xfs_select_zone(mp, write_hint, pack_tight);  		if (!*oz)  			goto out_error; - -		xfs_zone_cache_create_association(ip, *oz); +		xfs_set_cached_zone(ip, *oz);  	}  	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), @@ -966,6 +949,12 @@ xfs_free_open_zones(  		xfs_open_zone_put(oz);  	}  	spin_unlock(&zi->zi_open_zones_lock); + +	/* +	 * Wait for all open zones to be freed so that they drop the group +	 * references: +	 */ +	rcu_barrier();  }  struct xfs_init_zones { @@ -1279,14 +1268,6 @@ xfs_mount_zones(  	error = xfs_zone_gc_mount(mp);  	if (error)  		goto out_free_zone_info; - -	/* -	 * Set up a mru cache to track inode to open zone for data placement -	 * purposes. The magic values for group count and life time is the -	 * same as the defaults for file streams, which seems sane enough. -	 */ -	xfs_mru_cache_create(&mp->m_zone_cache, mp, -			5000, 10, xfs_zone_cache_free_func);  	return 0;  out_free_zone_info: @@ -1300,5 +1281,4 @@ xfs_unmount_zones(  {  	xfs_zone_gc_unmount(mp);  	xfs_free_zone_info(mp->m_zone_info); -	xfs_mru_cache_destroy(mp->m_zone_cache);  } diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 064cd1a857a0..4ade54445532 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -114,6 +114,8 @@ struct xfs_gc_bio {  	/* Open Zone being written to */  	struct xfs_open_zone		*oz; +	struct xfs_rtgroup		*victim_rtg; +  	/* Bio used for reads and writes, including the bvec used by it */  	struct bio_vec			bv;  	struct bio			bio;	/* must be last */ @@ -264,6 +266,7 @@ xfs_zone_gc_iter_init(  	iter->rec_count = 0;  	iter->rec_idx = 0;  	iter->victim_rtg = victim_rtg; +	atomic_inc(&victim_rtg->rtg_gccount);  }  /* @@ -362,6 +365,7 @@ xfs_zone_gc_query(  	return 0;  done: +	atomic_dec(&iter->victim_rtg->rtg_gccount);  	xfs_rtgroup_rele(iter->victim_rtg);  	iter->victim_rtg = NULL;  	return 0; @@ -451,6 +455,20 @@ xfs_zone_gc_pick_victim_from(  		if (!rtg)  			continue; +		/* +		 * If the zone is already undergoing GC, don't pick it again. +		 * +		 * This prevents us from picking one of the zones for which we +		 * already submitted GC I/O, but for which the remapping hasn't +		 * concluded yet.  This won't cause data corruption, but +		 * increases write amplification and slows down GC, so this is +		 * a bad thing. +		 */ +		if (atomic_read(&rtg->rtg_gccount)) { +			xfs_rtgroup_rele(rtg); +			continue; +		} +  		/* skip zones that are just waiting for a reset */  		if (rtg_rmap(rtg)->i_used_blocks == 0 ||  		    rtg_rmap(rtg)->i_used_blocks >= victim_used) { @@ -491,21 +509,6 @@ xfs_zone_gc_select_victim(  	struct xfs_rtgroup	*victim_rtg = NULL;  	unsigned int		bucket; -	if (xfs_is_shutdown(mp)) -		return false; - -	if (iter->victim_rtg) -		return true; - -	/* -	 * Don't start new work if we are asked to stop or park. 
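The rtg_gccount guard seen above works by pairing one atomic_inc() per iterator or in-flight chunk with a matching atomic_dec() at completion; victim selection simply skips groups with a nonzero count. Reduced to an illustrative sketch (the plain read before the increment is only safe because a single GC thread picks victims, as in XFS):

static bool demo_try_gc_zone(struct xfs_rtgroup *rtg)
{
	/* Skip zones whose previous GC I/O has not fully completed. */
	if (atomic_read(&rtg->rtg_gccount))
		return false;

	atomic_inc(&rtg->rtg_gccount);	/* one count per iterator/chunk */

	/* ... read the old blocks, rewrite them, remap the extents ... */

	atomic_dec(&rtg->rtg_gccount);	/* group may be picked again */
	return true;
}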
-	 */ -	if (kthread_should_stop() || kthread_should_park()) -		return false; - -	if (!xfs_zoned_need_gc(mp)) -		return false; -  	spin_lock(&zi->zi_used_buckets_lock);  	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {  		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); @@ -703,6 +706,9 @@ xfs_zone_gc_start_chunk(  	chunk->scratch = &data->scratch[data->scratch_idx];  	chunk->data = data;  	chunk->oz = oz; +	chunk->victim_rtg = iter->victim_rtg; +	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); +	atomic_inc(&chunk->victim_rtg->rtg_gccount);  	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);  	bio->bi_end_io = xfs_zone_gc_end_io; @@ -725,6 +731,8 @@ static void  xfs_zone_gc_free_chunk(  	struct xfs_gc_bio	*chunk)  { +	atomic_dec(&chunk->victim_rtg->rtg_gccount); +	xfs_rtgroup_rele(chunk->victim_rtg);  	list_del(&chunk->entry);  	xfs_open_zone_put(chunk->oz);  	xfs_irele(chunk->ip); @@ -785,6 +793,10 @@ xfs_zone_gc_split_write(  	split_chunk->oz = chunk->oz;  	atomic_inc(&chunk->oz->oz_ref); +	split_chunk->victim_rtg = chunk->victim_rtg; +	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); +	atomic_inc(&chunk->victim_rtg->rtg_gccount); +  	chunk->offset += split_len;  	chunk->len -= split_len;  	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); @@ -975,6 +987,27 @@ xfs_zone_gc_reset_zones(  	} while (next);  } +static bool +xfs_zone_gc_should_start_new_work( +	struct xfs_zone_gc_data	*data) +{ +	if (xfs_is_shutdown(data->mp)) +		return false; +	if (!xfs_zone_gc_space_available(data)) +		return false; + +	if (!data->iter.victim_rtg) { +		if (kthread_should_stop() || kthread_should_park()) +			return false; +		if (!xfs_zoned_need_gc(data->mp)) +			return false; +		if (!xfs_zone_gc_select_victim(data)) +			return false; +	} + +	return true; +} +  /*   * Handle the work to read and write data for GC and to reset the zones,   * including handling all completions. @@ -982,7 +1015,7 @@ xfs_zone_gc_reset_zones(   * Note that the order of the chunks is preserved so that we don't undo the   * optimal order established by xfs_zone_gc_query().   
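All the set_current_state(TASK_RUNNING) calls sprinkled through the function below feed the sleep decision in xfs_zoned_gcd(): the thread declares its intent to sleep first, does whatever work is pending, and only actually sleeps if nothing flipped it back to running. The general kthread idiom, sketched with a hypothetical do_pending_work():

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static void do_pending_work(void *arg);	/* imagined work function */

static int demo_gc_thread(void *arg)
{
	for (;;) {
		/* Declare the intent to sleep before checking for work. */
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);

		do_pending_work(arg);	/* sets TASK_RUNNING on progress */

		/*
		 * Only sleep if nothing moved us back to TASK_RUNNING;
		 * otherwise look for more work immediately.
		 */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
			continue;
		}

		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		schedule();
	}
	return 0;
}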
*/ -static bool +static void  xfs_zone_gc_handle_work(  	struct xfs_zone_gc_data	*data)  { @@ -996,30 +1029,22 @@  	zi->zi_reset_list = NULL;  	spin_unlock(&zi->zi_reset_list_lock); -	if (!xfs_zone_gc_select_victim(data) || -	    !xfs_zone_gc_space_available(data)) { -		if (list_empty(&data->reading) && -		    list_empty(&data->writing) && -		    list_empty(&data->resetting) && -		    !reset_list) -			return false; -	} - -	__set_current_state(TASK_RUNNING); -	try_to_freeze(); - -	if (reset_list) +	if (reset_list) { +		set_current_state(TASK_RUNNING);  		xfs_zone_gc_reset_zones(data, reset_list); +	}  	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {  		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)  			break; +		set_current_state(TASK_RUNNING);  		xfs_zone_gc_finish_reset(chunk);  	}  	list_for_each_entry_safe(chunk, next, &data->writing, entry) {  		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)  			break; +		set_current_state(TASK_RUNNING);  		xfs_zone_gc_finish_chunk(chunk);  	} @@ -1027,15 +1052,18 @@  	list_for_each_entry_safe(chunk, next, &data->reading, entry) {  		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)  			break; +		set_current_state(TASK_RUNNING);  		xfs_zone_gc_write_chunk(chunk);  	}  	blk_finish_plug(&plug); -	blk_start_plug(&plug); -	while (xfs_zone_gc_start_chunk(data)) -		; -	blk_finish_plug(&plug); -	return true; +	if (xfs_zone_gc_should_start_new_work(data)) { +		set_current_state(TASK_RUNNING); +		blk_start_plug(&plug); +		while (xfs_zone_gc_start_chunk(data)) +			; +		blk_finish_plug(&plug); +	}  }  /* @@ -1059,8 +1087,18 @@  xfs_zoned_gcd(  	for (;;) {  		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);  		xfs_set_zonegc_running(mp); -		if (xfs_zone_gc_handle_work(data)) + +		xfs_zone_gc_handle_work(data); + +		/* +		 * Only sleep if nothing set the state to running.  Else check for +		 * work again as someone might have queued up more work and woken +		 * us in the meantime. +		 */ +		if (get_current_state() == TASK_RUNNING) { +			try_to_freeze();  			continue; +		}  		if (list_empty(&data->reading) &&  		    list_empty(&data->writing) && diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index 35e6de3d25ed..4322e26dd99a 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -44,6 +44,8 @@ struct xfs_open_zone {  	 * the life time of an open zone.  	 */  	struct xfs_rtgroup	*oz_rtg; + +	struct rcu_head		oz_rcu; };  /*
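To close, the oz_rcu head added above is what makes the lockless cached-zone lookup in xfs_zone_alloc.c safe: the final put frees through call_rcu(), and readers combine rcu_read_lock() with atomic_inc_not_zero() so they never take a reference on an object whose refcount already hit zero. The generic shape of that pattern, as a sketch with demo_ names:

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_zone {
	atomic_t ref;
	struct rcu_head rcu;
};

static void demo_zone_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct demo_zone, rcu));
}

static void demo_zone_put(struct demo_zone *z)
{
	/* Defer the free so RCU readers can still inspect z->ref. */
	if (atomic_dec_and_test(&z->ref))
		call_rcu(&z->rcu, demo_zone_free_rcu);
}

static struct demo_zone *demo_zone_get_cached(struct demo_zone __rcu **slot)
{
	struct demo_zone *z;

	rcu_read_lock();
	z = rcu_dereference(*slot);
	/*
	 * A concurrent final put may already be underway; only succeed
	 * if the count is still nonzero.  The grace period keeps the
	 * memory valid while we look.
	 */
	if (z && !atomic_inc_not_zero(&z->ref))
		z = NULL;
	rcu_read_unlock();
	return z;
}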
