Diffstat (limited to 'fs/dax.c')
-rw-r--r--	fs/dax.c	319
1 file changed, 219 insertions, 100 deletions
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
 				      void *entry, sector_t sector,
-				      unsigned long flags)
+				      unsigned long flags, bool dirty)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	void *new_entry;
 	pgoff_t index = vmf->pgoff;
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		entry = new_entry;
 	}
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 
 	spin_unlock_irq(&mapping->tree_lock);
@@ -820,38 +820,42 @@ out:
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct address_space *mapping,
-		struct block_device *bdev, struct dax_device *dax_dev,
-		sector_t sector, size_t size, void *entry,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
-	unsigned long vaddr = vmf->address;
-	void *ret, *kaddr;
+	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
+}
+
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+			 pfn_t *pfnp)
+{
+	const sector_t sector = dax_iomap_sector(iomap, pos);
 	pgoff_t pgoff;
+	void *kaddr;
 	int id, rc;
-	pfn_t pfn;
+	long length;
 
-	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
 	if (rc)
 		return rc;
-
 	id = dax_read_lock();
-	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (rc < 0) {
-		dax_read_unlock(id);
-		return rc;
+	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   &kaddr, pfnp);
+	if (length < 0) {
+		rc = length;
+		goto out;
 	}
+	rc = -EINVAL;
+	if (PFN_PHYS(length) < size)
+		goto out;
+	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+		goto out;
+	/* For larger pages we need devmap */
+	if (length > 1 && !pfn_t_devmap(*pfnp))
+		goto out;
+	rc = 0;
+out:
 	dax_read_unlock(id);
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
-	if (IS_ERR(ret))
-		return PTR_ERR(ret);
-
-	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	if (vmf->flags & FAULT_FLAG_WRITE)
-		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
-	else
-		return vm_insert_mixed(vma, vaddr, pfn);
+	return rc;
 }
 
 /*
@@ -877,7 +881,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	}
 
 	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(entry2)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
@@ -936,11 +940,6 @@ int __dax_zero_page_range(struct block_device *bdev,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
-	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
-}
-
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
@@ -1080,19 +1079,33 @@ static int dax_fault_return(int error)
 	return VM_FAULT_SIGBUS;
 }
 
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+		struct vm_area_struct *vma, struct iomap *iomap)
+{
+	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+		&& (iomap->flags & IOMAP_F_DIRTY);
+}
+
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	struct vm_area_struct *vma = vmf->vma;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-	sector_t sector;
 	struct iomap iomap = { 0 };
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	int vmf_ret = 0;
 	void *entry;
+	pfn_t pfn;
 
 	trace_dax_pte_fault(inode, vmf, vmf_ret);
 	/*
@@ -1105,7 +1118,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto out;
 	}
 
-	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1140,9 +1153,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto error_finish_iomap;
 	}
 
-	sector = dax_iomap_sector(&iomap, pos);
-
 	if (vmf->cow_page) {
+		sector_t sector = dax_iomap_sector(&iomap, pos);
+
 		switch (iomap.type) {
 		case IOMAP_HOLE:
 		case IOMAP_UNWRITTEN:
@@ -1168,22 +1181,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto finish_iomap;
 	}
 
+	sync = dax_fault_is_synchronous(flags, vma, &iomap);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
 		if (iomap.flags & IOMAP_F_NEW) {
 			count_vm_event(PGMAJFAULT);
-			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
-		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-				sector, PAGE_SIZE, entry, vmf->vma, vmf);
+		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+		if (error < 0)
+			goto error_finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						 dax_iomap_sector(&iomap, pos),
+						 0, write && !sync);
+		if (IS_ERR(entry)) {
+			error = PTR_ERR(entry);
+			goto error_finish_iomap;
+		}
+
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PTE into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp)) {
+				error = -EIO;
+				goto error_finish_iomap;
+			}
+			*pfnp = pfn;
+			vmf_ret = VM_FAULT_NEEDDSYNC | major;
+			goto finish_iomap;
+		}
+		trace_dax_insert_mapping(inode, vmf, entry);
+		if (write)
+			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+		else
+			error = vm_insert_mixed(vma, vaddr, pfn);
+
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
-		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+		if (!write) {
 			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
@@ -1218,53 +1264,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void *entry)
-{
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	const sector_t sector = dax_iomap_sector(iomap, pos);
-	struct dax_device *dax_dev = iomap->dax_dev;
-	struct block_device *bdev = iomap->bdev;
-	struct inode *inode = mapping->host;
-	const size_t size = PMD_SIZE;
-	void *ret = NULL, *kaddr;
-	long length = 0;
-	pgoff_t pgoff;
-	pfn_t pfn = {};
-	int id;
-
-	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
-		goto fallback;
-
-	id = dax_read_lock();
-	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (length < 0)
-		goto unlock_fallback;
-	length = PFN_PHYS(length);
-
-	if (length < size)
-		goto unlock_fallback;
-	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
-		goto unlock_fallback;
-	if (!pfn_t_devmap(pfn))
-		goto unlock_fallback;
-	dax_read_unlock(id);
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
-			RADIX_DAX_PMD);
-	if (IS_ERR(ret))
-		goto fallback;
-
-	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
-	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-unlock_fallback:
-	dax_read_unlock(id);
-fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
-	return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		void *entry)
@@ -1283,7 +1287,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		goto fallback;
 
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(ret))
 		goto fallback;
 
@@ -1305,13 +1309,14 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
@@ -1320,6 +1325,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	void *entry;
 	loff_t pos;
 	int error;
+	pfn_t pfn;
 
 	/*
 	 * Check whether offset isn't beyond end of file now. Caller is
@@ -1327,7 +1333,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	 * this is a reliable test.
 	 */
 	pgoff = linear_page_index(vma, pmd_addr);
-	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
@@ -1351,13 +1357,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff > max_pgoff) {
+	if (pgoff >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
@@ -1395,9 +1401,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+		if (error < 0)
+			goto finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
						dax_iomap_sector(&iomap, pos),
						RADIX_DAX_PMD, write && !sync);
+		if (IS_ERR(entry))
+			goto finish_iomap;
+
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PMD into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp))
+				goto finish_iomap;
+			*pfnp = pfn;
+			result = VM_FAULT_NEEDDSYNC;
+			goto finish_iomap;
+		}
+
+		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
					    write);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
@@ -1437,7 +1471,7 @@ out:
 	return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
@@ -1447,7 +1481,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
+ * @ops: Iomap ops passed from the file system
  *
  * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1455,15 +1491,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 * successfully.
 */
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-		    const struct iomap_ops *ops)
+		    pfn_t *pfnp, const struct iomap_ops *ops)
 {
 	switch (pe_size) {
 	case PE_SIZE_PTE:
-		return dax_iomap_pte_fault(vmf, ops);
+		return dax_iomap_pte_fault(vmf, pfnp, ops);
 	case PE_SIZE_PMD:
-		return dax_iomap_pmd_fault(vmf, ops);
+		return dax_iomap_pmd_fault(vmf, pfnp, ops);
 	default:
 		return VM_FAULT_FALLBACK;
 	}
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts writeable PTE or PMD entry into page tables for mmaped
+ * DAX file.  It takes care of marking corresponding radix tree entry as dirty
+ * as well.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+				  enum page_entry_size pe_size,
+				  pfn_t pfn)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	void *entry, **slot;
+	pgoff_t index = vmf->pgoff;
+	int vmf_ret, error;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	/* Did we race with someone splitting entry or so? */
+	if (!entry ||
+	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+		put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
						      VM_FAULT_NOPAGE);
+		return VM_FAULT_NOPAGE;
+	}
+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+	entry = lock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	switch (pe_size) {
+	case PE_SIZE_PTE:
+		error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+		vmf_ret = dax_fault_return(error);
+		break;
+#ifdef CONFIG_FS_DAX_PMD
+	case PE_SIZE_PMD:
+		vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			pfn, true);
+		break;
+#endif
+	default:
+		vmf_ret = VM_FAULT_FALLBACK;
+	}
+	put_locked_mapping_entry(mapping, index);
+	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+	return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and handles inserting of appropriate page
+ * table entry.
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+			  pfn_t pfn)
+{
+	int err;
+	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+	size_t len = 0;
+
+	if (pe_size == PE_SIZE_PTE)
+		len = PAGE_SIZE;
+	else if (pe_size == PE_SIZE_PMD)
+		len = PMD_SIZE;
+	else
+		WARN_ON_ONCE(1);
+	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
+	if (err)
+		return VM_FAULT_SIGBUS;
+	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
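
The new pfnp argument and the VM_FAULT_NEEDDSYNC return value are meant to be consumed by the filesystem's fault handler. Below is a minimal caller-side sketch (not part of this diff) of that expected pattern: when dax_iomap_fault() defers the insertion for a synchronous fault, the handler persists the metadata with dax_finish_sync_fault(), which fsyncs the faulted range and only then maps the returned pfn. The names example_dax_huge_fault and example_iomap_ops are hypothetical stand-ins for the filesystem's real handler and iomap operations; filesystem locking (e.g. sb_start_pagefault()/sb_end_pagefault()) is omitted.

/*
 * Sketch of a ->huge_fault handler using the synchronous-fault API
 * introduced by this diff.  Assumes a hypothetical example_iomap_ops.
 */
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>

extern const struct iomap_ops example_iomap_ops;	/* hypothetical */

static int example_dax_huge_fault(struct vm_fault *vmf,
				  enum page_entry_size pe_size)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	pfn_t pfn;
	int result;

	if (write)
		file_update_time(vmf->vma->vm_file);

	/* For a synchronous fault this only returns the pfn via &pfn. */
	result = dax_iomap_fault(vmf, pe_size, &pfn, &example_iomap_ops);

	/* Metadata must be made durable before the mapping is exposed. */
	if (result & VM_FAULT_NEEDDSYNC)
		result = dax_finish_sync_fault(vmf, pe_size, pfn);

	return result;
}

Userspace opts into this path by mmap()ing the DAX file with MAP_SHARED_VALIDATE | MAP_SYNC, which sets VM_SYNC on the VMA; only then does dax_fault_is_synchronous() return true for write faults on ranges whose metadata is still dirty (IOMAP_F_DIRTY).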
