diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/hmm.c | 262 |
1 files changed, 243 insertions, 19 deletions
@@ -10,6 +10,7 @@ */ #include <linux/pagewalk.h> #include <linux/hmm.h> +#include <linux/hmm-dma.h> #include <linux/init.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -23,6 +24,7 @@ #include <linux/sched/mm.h> #include <linux/jump_label.h> #include <linux/dma-mapping.h> +#include <linux/pci-p2pdma.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> @@ -39,13 +41,21 @@ enum { HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, }; +enum { + /* These flags are carried from input-to-output */ + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | + HMM_PFN_P2PDMA_BUS, +}; + static int hmm_pfns_fill(unsigned long addr, unsigned long end, struct hmm_range *range, unsigned long cpu_flags) { unsigned long i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) - range->hmm_pfns[i] = cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= cpu_flags; + } return 0; } @@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long cpu_flags; pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; + uint64_t new_pfn_flags = 0; if (pte_none_mostly(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *hmm_pfn = 0; - return 0; + goto out; } if (!pte_present(pte)) { @@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; - return 0; + new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; + goto out; } required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); - if (!required_fault) { - *hmm_pfn = 0; - return 0; - } + if (!required_fault) + goto out; if (!non_swap_entry(entry)) goto fault; @@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_unmap(ptep); return -EFAULT; } - *hmm_pfn = HMM_PFN_ERROR; - return 0; + new_pfn_flags = HMM_PFN_ERROR; + goto out; } - *hmm_pfn = pte_pfn(pte) | cpu_flags; + new_pfn_flags = pte_pfn(pte) | cpu_flags; +out: + *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags; return 0; fault: @@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; i < npages; ++i, ++pfn) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } goto out_unlock; } @@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->hmm_pfns[i] = pfn | cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= pfn | cpu_flags; + } spin_unlock(ptl); return 0; @@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range) return ret; } EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_dma_map_alloc - Allocate HMM map structure + * @dev: device to allocate structure for + * @map: HMM map to allocate + * @nr_entries: number of entries in the map + * @dma_entry_size: size of the DMA entry in the map + * + * Allocate the HMM map structure and all the lists it contains. + * Return 0 on success, -ENOMEM on failure. + */ +int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, + size_t nr_entries, size_t dma_entry_size) +{ + bool dma_need_sync = false; + bool use_iova; + + WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size)); + + /* + * The HMM API violates our normal DMA buffer ownership rules and can't + * transfer buffer ownership. The dma_addressing_limited() check is a + * best approximation to ensure no swiotlb buffering happens. + */ +#ifdef CONFIG_DMA_NEED_SYNC + dma_need_sync = !dev->dma_skip_sync; +#endif /* CONFIG_DMA_NEED_SYNC */ + if (dma_need_sync || dma_addressing_limited(dev)) + return -EOPNOTSUPP; + + map->dma_entry_size = dma_entry_size; + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) + return -ENOMEM; + + use_iova = dma_iova_try_alloc(dev, &map->state, 0, + nr_entries * PAGE_SIZE); + if (!use_iova && dma_need_unmap(dev)) { + map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->dma_list) + goto err_dma; + } + return 0; + +err_dma: + kvfree(map->pfn_list); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(hmm_dma_map_alloc); + +/** + * hmm_dma_map_free - iFree HMM map structure + * @dev: device to free structure from + * @map: HMM map containing the various lists and state + * + * Free the HMM map structure and all the lists it contains. + */ +void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map) +{ + if (dma_use_iova(&map->state)) + dma_iova_free(dev, &map->state); + kvfree(map->pfn_list); + kvfree(map->dma_list); +} +EXPORT_SYMBOL_GPL(hmm_dma_map_free); + +/** + * hmm_dma_map_pfn - Map a physical HMM page to DMA address + * @dev: Device to map the page for + * @map: HMM map + * @idx: Index into the PFN and dma address arrays + * @p2pdma_state: PCI P2P state. + * + * dma_alloc_iova() allocates IOVA based on the size specified by their use in + * iova->size. Call this function after IOVA allocation to link whole @page + * to get the DMA address. Note that very first call to this function + * will have @offset set to 0 in the IOVA space allocated from + * dma_alloc_iova(). For subsequent calls to this function on same @iova, + * @offset needs to be advanced by the caller with the size of previous + * page that was linked + DMA address returned for the previous page that was + * linked by this function. + */ +dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, + size_t idx, + struct pci_p2pdma_map_state *p2pdma_state) +{ + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + struct page *page = hmm_pfn_to_page(pfns[idx]); + phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]); + size_t offset = idx * map->dma_entry_size; + unsigned long attrs = 0; + dma_addr_t dma_addr; + int ret; + + if ((pfns[idx] & HMM_PFN_DMA_MAPPED) && + !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) { + /* + * We are in this flow when there is a need to resync flags, + * for example when page was already linked in prefetch call + * with READ flag and now we need to add WRITE flag + * + * This page was already programmed to HW and we don't want/need + * to unlink and link it again just to resync flags. + */ + if (dma_use_iova(state)) + return state->addr + offset; + + /* + * Without dma_need_unmap, the dma_addrs array is NULL, thus we + * need to regenerate the address below even if there already + * was a mapping. But !dma_need_unmap implies that the + * mapping stateless, so this is fine. + */ + if (dma_need_unmap(dev)) + return dma_addrs[idx]; + + /* Continue to remapping */ + } + + switch (pci_p2pdma_state(p2pdma_state, dev, page)) { + case PCI_P2PDMA_MAP_NONE: + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + pfns[idx] |= HMM_PFN_P2PDMA; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; + return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + default: + return DMA_MAPPING_ERROR; + } + + if (dma_use_iova(state)) { + ret = dma_iova_link(dev, state, paddr, offset, + map->dma_entry_size, DMA_BIDIRECTIONAL, + attrs); + if (ret) + goto error; + + ret = dma_iova_sync(dev, state, offset, map->dma_entry_size); + if (ret) { + dma_iova_unlink(dev, state, offset, map->dma_entry_size, + DMA_BIDIRECTIONAL, attrs); + goto error; + } + + dma_addr = state->addr + offset; + } else { + if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs)) + goto error; + + dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, dma_addr)) + goto error; + + if (dma_need_unmap(dev)) + dma_addrs[idx] = dma_addr; + } + pfns[idx] |= HMM_PFN_DMA_MAPPED; + return dma_addr; +error: + pfns[idx] &= ~HMM_PFN_P2PDMA; + return DMA_MAPPING_ERROR; + +} +EXPORT_SYMBOL_GPL(hmm_dma_map_pfn); + +/** + * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address + * @dev: Device to unmap the page from + * @map: HMM map + * @idx: Index of the PFN to unmap + * + * Returns true if the PFN was mapped and has been unmapped, false otherwise. + */ +bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx) +{ + const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED; + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + unsigned long attrs = 0; + + if ((pfns[idx] & valid_dma) != valid_dma) + return false; + + if (pfns[idx] & HMM_PFN_P2PDMA_BUS) + ; /* no need to unmap bus address P2P mappings */ + else if (dma_use_iova(state)) { + if (pfns[idx] & HMM_PFN_P2PDMA) + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + dma_iova_unlink(dev, state, idx * map->dma_entry_size, + map->dma_entry_size, DMA_BIDIRECTIONAL, attrs); + } else if (dma_need_unmap(dev)) + dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size, + DMA_BIDIRECTIONAL); + + pfns[idx] &= + ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS); + return true; +} +EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn); |