Diffstat (limited to 'drivers')
385 files changed, 25519 insertions, 5785 deletions
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 97ee19f2cae0..56107aa00274 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -22,6 +22,7 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/acpi.h> +#include <linux/bitfield.h> #include <linux/io.h> #include <linux/interrupt.h> #include <linux/timer.h> @@ -552,26 +553,25 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, } static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, - int sev, bool sync) + int sev, bool sync) { struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); int flags = sync ? MF_ACTION_REQUIRED : 0; + char error_type[120]; bool queued = false; int sec_sev, i; char *p; - log_arm_hw_error(err); - sec_sev = ghes_severity(gdata->error_severity); + log_arm_hw_error(err, sec_sev); if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) return false; p = (char *)(err + 1); for (i = 0; i < err->err_info_num; i++) { struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p; - bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR); + bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR; bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR); - const char *error_type = "unknown error"; /* * The field (err_info->error_info & BIT(26)) is fixed to set to @@ -585,12 +585,15 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, continue; } - if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs)) - error_type = cper_proc_error_type_strs[err_info->type]; + cper_bits_to_str(error_type, sizeof(error_type), + FIELD_GET(CPER_ARM_ERR_TYPE_MASK, err_info->type), + cper_proc_error_type_strs, + ARRAY_SIZE(cper_proc_error_type_strs)); pr_warn_ratelimited(FW_WARN GHES_PFX - "Unhandled processor error type: %s\n", - error_type); + "Unhandled processor error type 0x%02x: %s%s\n", + err_info->type, error_type, + (err_info->type & ~CPER_ARM_ERR_TYPE_MASK) ? " with reserved bit(s)" : ""); p += err_info->length; } @@ -895,11 +898,9 @@ static void ghes_do_proc(struct ghes *ghes, arch_apei_report_mem_error(sev, mem_err); queued = ghes_handle_memory_failure(gdata, sev, sync); - } - else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { + } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { ghes_handle_aer(gdata); - } - else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { + } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { queued = ghes_handle_arm_hw_error(gdata, sev, sync); } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) { struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata); diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 11e4483685c9..77a81627aaef 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -910,12 +910,13 @@ static void hmat_register_target(struct memory_target *target) * Register generic port perf numbers. The nid may not be * initialized and is still NUMA_NO_NODE. 
*/ - mutex_lock(&target_lock); - if (*(u16 *)target->gen_port_device_handle) { - hmat_update_generic_target(target); - target->registered = true; + scoped_guard(mutex, &target_lock) { + if (*(u16 *)target->gen_port_device_handle) { + hmat_update_generic_target(target); + target->registered = true; + return; + } } - mutex_unlock(&target_lock); hmat_hotplug_target(target); } diff --git a/drivers/amba/Kconfig b/drivers/amba/Kconfig index fb6c7e0b4cce..14bb61ff801e 100644 --- a/drivers/amba/Kconfig +++ b/drivers/amba/Kconfig @@ -5,7 +5,7 @@ config ARM_AMBA if ARM_AMBA config TEGRA_AHB - bool + bool "Enable AHB driver for NVIDIA Tegra SoCs" if COMPILE_TEST default y if ARCH_TEGRA help Adds AHB configuration functionality for NVIDIA Tegra SoCs, diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index f48fb63d7e85..462217935558 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4216,6 +4216,10 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = { /* Apacer models with LPM issues */ { "Apacer AS340*", NULL, ATA_QUIRK_NOLPM }, + /* Silicon Motion models with LPM issues */ + { "MD619HXCLDE3TC", "TCVAID", ATA_QUIRK_NOLPM }, + { "MD619GXCLDE3TC", "TCV35D", ATA_QUIRK_NOLPM }, + /* These specific Samsung models/firmware-revs do not handle LPM well */ { "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_QUIRK_NOLPM }, { "SAMSUNG SSD PM830 mSATA *", "CXM13D1Q", ATA_QUIRK_NOLPM }, diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 1e2a2c33cdc8..785b6e371abf 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -3191,7 +3191,8 @@ void ata_sff_port_init(struct ata_port *ap) int __init ata_sff_init(void) { - ata_sff_wq = alloc_workqueue("ata_sff", WQ_MEM_RECLAIM, WQ_MAX_ACTIVE); + ata_sff_wq = alloc_workqueue("ata_sff", WQ_MEM_RECLAIM | WQ_PERCPU, + WQ_MAX_ACTIVE); if (!ata_sff_wq) return -ENOMEM; diff --git a/drivers/ata/pata_it821x.c b/drivers/ata/pata_it821x.c index 042f6ad1f7c6..fc762dcc61bf 100644 --- a/drivers/ata/pata_it821x.c +++ b/drivers/ata/pata_it821x.c @@ -75,6 +75,7 @@ #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/string.h> #include <scsi/scsi_host.h> #include <linux/libata.h> @@ -632,9 +633,9 @@ static void it821x_display_disk(struct ata_port *ap, int n, u8 *buf) cbl = ""; if (mode) - snprintf(mbuf, 8, "%5s%d", mtype, mode - 1); + snprintf(mbuf, sizeof(mbuf), "%5s%d", mtype, mode - 1); else - strcpy(mbuf, "PIO"); + strscpy(mbuf, "PIO"); if (buf[52] == 4) ata_port_info(ap, "%d: %-6s %-8s %s %s\n", n, mbuf, types[buf[52]], id, cbl); diff --git a/drivers/ata/pata_pcmcia.c b/drivers/ata/pata_pcmcia.c index cf3810933a27..caefcd8c4b3c 100644 --- a/drivers/ata/pata_pcmcia.c +++ b/drivers/ata/pata_pcmcia.c @@ -344,6 +344,7 @@ static const struct pcmcia_device_id pcmcia_devices[] = { PCMCIA_DEVICE_PROD_ID2("NinjaATA-", 0xebe0bd79), PCMCIA_DEVICE_PROD_ID12("PCMCIA", "CD-ROM", 0x281f1c5d, 0x66536591), PCMCIA_DEVICE_PROD_ID12("PCMCIA", "PnPIDE", 0x281f1c5d, 0x0c694728), + PCMCIA_DEVICE_PROD_ID2("PCMCIA ATA/ATAPI Adapter", 0x888d7b73), PCMCIA_DEVICE_PROD_ID12("SHUTTLE TECHNOLOGY LTD.", "PCCARD-IDE/ATAPI Adapter", 0x4a3f0ba0, 0x322560e1), PCMCIA_DEVICE_PROD_ID12("SEAGATE", "ST1", 0x87c1b330, 0xe1f30883), PCMCIA_DEVICE_PROD_ID12("SAMSUNG", "04/05/06", 0x43d74cb4, 0x6a22777d), diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index e25daf2396d3..082b910ddf0d 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -231,42 +231,6 @@ struct 
tpm_chip *tpm_default_chip(void) EXPORT_SYMBOL_GPL(tpm_default_chip); /** - * tpm_find_get_ops() - find and reserve a TPM chip - * @chip: a &struct tpm_chip instance, %NULL for the default chip - * - * Finds a TPM chip and reserves its class device and operations. The chip must - * be released with tpm_put_ops() after use. - * This function is for internal use only. It supports existing TPM callers - * by accepting NULL, but those callers should be converted to pass in a chip - * directly. - * - * Return: - * A reserved &struct tpm_chip instance. - * %NULL if a chip is not found. - * %NULL if the chip is not available. - */ -struct tpm_chip *tpm_find_get_ops(struct tpm_chip *chip) -{ - int rc; - - if (chip) { - if (!tpm_try_get_ops(chip)) - return chip; - return NULL; - } - - chip = tpm_default_chip(); - if (!chip) - return NULL; - rc = tpm_try_get_ops(chip); - /* release additional reference we got from tpm_default_chip() */ - put_device(&chip->dev); - if (rc) - return NULL; - return chip; -} - -/** * tpm_dev_release() - free chip memory and the device number * @dev: the character device for the TPM chip * @@ -282,7 +246,6 @@ static void tpm_dev_release(struct device *dev) kfree(chip->work_space.context_buf); kfree(chip->work_space.session_buf); - kfree(chip->allocated_banks); #ifdef CONFIG_TCG_TPM2_HMAC kfree(chip->auth); #endif diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c index f2a5e09257dd..f942c0c8e402 100644 --- a/drivers/char/tpm/tpm-dev-common.c +++ b/drivers/char/tpm/tpm-dev-common.c @@ -275,7 +275,8 @@ void tpm_common_release(struct file *file, struct file_priv *priv) int __init tpm_dev_common_init(void) { - tpm_dev_wq = alloc_workqueue("tpm_dev_wq", WQ_MEM_RECLAIM, 0); + tpm_dev_wq = alloc_workqueue("tpm_dev_wq", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); return !tpm_dev_wq ? 
-ENOMEM : 0; } diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index c9f173001d0e..f745a098908b 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -313,10 +313,13 @@ int tpm_is_tpm2(struct tpm_chip *chip) { int rc; - chip = tpm_find_get_ops(chip); if (!chip) return -ENODEV; + rc = tpm_try_get_ops(chip); + if (rc) + return rc; + rc = (chip->flags & TPM_CHIP_FLAG_TPM2) != 0; tpm_put_ops(chip); @@ -338,10 +341,13 @@ int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx, { int rc; - chip = tpm_find_get_ops(chip); if (!chip) return -ENODEV; + rc = tpm_try_get_ops(chip); + if (rc) + return rc; + if (chip->flags & TPM_CHIP_FLAG_TPM2) rc = tpm2_pcr_read(chip, pcr_idx, digest, NULL); else @@ -369,10 +375,13 @@ int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, int rc; int i; - chip = tpm_find_get_ops(chip); if (!chip) return -ENODEV; + rc = tpm_try_get_ops(chip); + if (rc) + return rc; + for (i = 0; i < chip->nr_allocated_banks; i++) { if (digests[i].alg_id != chip->allocated_banks[i].alg_id) { rc = -EINVAL; @@ -492,10 +501,13 @@ int tpm_get_random(struct tpm_chip *chip, u8 *out, size_t max) if (!out || max > TPM_MAX_RNG_DATA) return -EINVAL; - chip = tpm_find_get_ops(chip); if (!chip) return -ENODEV; + rc = tpm_try_get_ops(chip); + if (rc) + return rc; + if (chip->flags & TPM_CHIP_FLAG_TPM2) rc = tpm2_get_random(chip, out, max); else diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 2726bd38e5ac..02c07fef41ba 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -267,7 +267,6 @@ static inline void tpm_msleep(unsigned int delay_msec) int tpm_chip_bootstrap(struct tpm_chip *chip); int tpm_chip_start(struct tpm_chip *chip); void tpm_chip_stop(struct tpm_chip *chip); -struct tpm_chip *tpm_find_get_ops(struct tpm_chip *chip); struct tpm_chip *tpm_chip_alloc(struct device *dev, const struct tpm_class_ops *ops); diff --git a/drivers/char/tpm/tpm1-cmd.c b/drivers/char/tpm/tpm1-cmd.c index cf64c7385105..b49a790f1bd5 100644 --- a/drivers/char/tpm/tpm1-cmd.c +++ b/drivers/char/tpm/tpm1-cmd.c @@ -799,11 +799,6 @@ int tpm1_pm_suspend(struct tpm_chip *chip, u32 tpm_suspend_pcr) */ int tpm1_get_pcr_allocation(struct tpm_chip *chip) { - chip->allocated_banks = kcalloc(1, sizeof(*chip->allocated_banks), - GFP_KERNEL); - if (!chip->allocated_banks) - return -ENOMEM; - chip->allocated_banks[0].alg_id = TPM_ALG_SHA1; chip->allocated_banks[0].digest_size = hash_digest_size[HASH_ALGO_SHA1]; chip->allocated_banks[0].crypto_id = HASH_ALGO_SHA1; diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 5532e53a2dd3..dd502322f499 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -550,11 +550,9 @@ ssize_t tpm2_get_pcr_allocation(struct tpm_chip *chip) nr_possible_banks = be32_to_cpup( (__be32 *)&buf.data[TPM_HEADER_SIZE + 5]); - - chip->allocated_banks = kcalloc(nr_possible_banks, - sizeof(*chip->allocated_banks), - GFP_KERNEL); - if (!chip->allocated_banks) { + if (nr_possible_banks > TPM2_MAX_PCR_BANKS) { + pr_err("tpm: out of bank capacity: %u > %u\n", + nr_possible_banks, TPM2_MAX_PCR_BANKS); rc = -ENOMEM; goto out; } diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index c75a531cfb98..6c25305c256e 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -179,6 +179,7 @@ static int crb_try_pluton_doorbell(struct crb_priv *priv, bool wait_for_complete * * @dev: crb device * @priv: crb private data + * @loc: locality * * Write 
CRB_CTRL_REQ_GO_IDLE to TPM_CRB_CTRL_REQ * The device should respond within TIMEOUT_C by clearing the bit. @@ -233,6 +234,7 @@ static int crb_go_idle(struct tpm_chip *chip) * * @dev: crb device * @priv: crb private data + * @loc: locality * * Write CRB_CTRL_REQ_CMD_READY to TPM_CRB_CTRL_REQ * and poll till the device acknowledge it by clearing the bit. @@ -412,7 +414,7 @@ static int crb_do_acpi_start(struct tpm_chip *chip) #ifdef CONFIG_ARM64 /* * This is a TPM Command Response Buffer start method that invokes a - * Secure Monitor Call to requrest the firmware to execute or cancel + * Secure Monitor Call to request the firmware to execute or cancel * a TPM 2.0 command. */ static int tpm_crb_smc_start(struct device *dev, unsigned long func_id) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 8954a8660ffc..e2a1769081b1 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -265,8 +265,7 @@ static u8 tpm_tis_status(struct tpm_chip *chip) /* * Dump stack for forensics, as invalid TPM_STS.x could be - * potentially triggered by impaired tpm_try_get_ops() or - * tpm_find_get_ops(). + * potentially triggered by impaired tpm_try_get_ops(). */ dump_stack(); } diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index be25ecbdba69..f8bfff5dd0bd 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3032,11 +3032,36 @@ static void qm_put_pci_res(struct hisi_qm *qm) pci_release_mem_regions(pdev); } +static void hisi_mig_region_clear(struct hisi_qm *qm) +{ + u32 val; + + /* Clear migration region set of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val &= ~QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + +static void hisi_mig_region_enable(struct hisi_qm *qm) +{ + u32 val; + + /* Select migration region of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val |= QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + static void hisi_qm_pci_uninit(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; pci_free_irq_vectors(pdev); + hisi_mig_region_clear(qm); qm_put_pci_res(qm); pci_disable_device(pdev); } @@ -5752,6 +5777,7 @@ int hisi_qm_init(struct hisi_qm *qm) goto err_free_qm_memory; qm_cmd_init(qm); + hisi_mig_region_enable(qm); return 0; @@ -5890,6 +5916,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm) } qm_cmd_init(qm); + hisi_mig_region_enable(qm); hisi_qm_dev_err_init(qm); /* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. 
*/ writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG); diff --git a/drivers/crypto/intel/qat/qat_common/adf_aer.c b/drivers/crypto/intel/qat/qat_common/adf_aer.c index 667d5e320f50..11728cf32653 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_aer.c +++ b/drivers/crypto/intel/qat/qat_common/adf_aer.c @@ -105,7 +105,6 @@ void adf_dev_restore(struct adf_accel_dev *accel_dev) accel_dev->accel_id); hw_device->reset_device(accel_dev); pci_restore_state(pdev); - pci_save_state(pdev); } } @@ -204,7 +203,6 @@ static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev) if (!pdev->is_busmaster) pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); res = adf_dev_up(accel_dev, false); if (res && res != -EALREADY) return PCI_ERS_RESULT_DISCONNECT; diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index bd2e282ca93a..77ac940e3013 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -11,25 +11,36 @@ #include "cxlpci.h" #include "cxl.h" -struct cxl_cxims_data { - int nr_maps; - u64 xormaps[] __counted_by(nr_maps); -}; - static const guid_t acpi_cxl_qtg_id_guid = GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071, 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52); -static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr) +#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1) +static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = { + [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4 +}; + +static const int valid_hbiw[] = { 1, 2, 3, 4, 6, 8, 12, 16 }; + +u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw) { - struct cxl_cxims_data *cximsd = cxlrd->platform_data; - int hbiw = cxlrd->cxlsd.nr_targets; + int nr_maps_to_apply = -1; u64 val; int pos; - /* No xormaps for host bridge interleave ways of 1 or 3 */ - if (hbiw == 1 || hbiw == 3) - return addr; + /* + * Strictly validate hbiw since this function is used for testing and + * that nullifies any expectation of trusted parameters from the CXL + * Region Driver. + */ + for (int i = 0; i < ARRAY_SIZE(valid_hbiw); i++) { + if (valid_hbiw[i] == hbiw) { + nr_maps_to_apply = hbiw_to_nr_maps[hbiw]; + break; + } + } + if (nr_maps_to_apply == -1 || nr_maps_to_apply > cximsd->nr_maps) + return ULLONG_MAX; /* * In regions using XOR interleave arithmetic the CXL HPA may not @@ -60,6 +71,14 @@ static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr) return addr; } +EXPORT_SYMBOL_FOR_MODULES(cxl_do_xormap_calc, "cxl_translate"); + +static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr) +{ + struct cxl_cxims_data *cximsd = cxlrd->platform_data; + + return cxl_do_xormap_calc(cximsd, addr, cxlrd->cxlsd.nr_targets); +} struct cxl_cxims_context { struct device *dev; @@ -353,7 +372,7 @@ static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd) rc = hmat_get_extended_linear_cache_size(&res, nid, &cache_size); if (rc) - return rc; + return 0; /* * The cache range is expected to be within the CFMWS. @@ -378,21 +397,18 @@ static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd) int rc; rc = cxl_acpi_set_cache_size(cxlrd); - if (!rc) - return; - - if (rc != -EOPNOTSUPP) { + if (rc) { /* - * Failing to support extended linear cache region resize does not + * Failing to retrieve extended linear cache region resize does not * prevent the region from functioning. Only causes cxl list showing * incorrect region size. 
*/ dev_warn(cxlrd->cxlsd.cxld.dev.parent, - "Extended linear cache calculation failed rc:%d\n", rc); - } + "Extended linear cache retrieval failed rc:%d\n", rc); - /* Ignoring return code */ - cxlrd->cache_size = 0; + /* Ignoring return code */ + cxlrd->cache_size = 0; + } } DEFINE_FREE(put_cxlrd, struct cxl_root_decoder *, @@ -453,8 +469,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, ig = CXL_DECODER_MIN_GRANULARITY; cxld->interleave_granularity = ig; - cxl_setup_extended_linear_cache(cxlrd); - if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) { if (ways != 1 && ways != 3) { cxims_ctx = (struct cxl_cxims_context) { @@ -470,18 +484,13 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, return -EINVAL; } } + cxlrd->ops.hpa_to_spa = cxl_apply_xor_maps; + cxlrd->ops.spa_to_hpa = cxl_apply_xor_maps; } - cxlrd->qos_class = cfmws->qtg_id; - - if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) { - cxlrd->ops = kzalloc(sizeof(*cxlrd->ops), GFP_KERNEL); - if (!cxlrd->ops) - return -ENOMEM; + cxl_setup_extended_linear_cache(cxlrd); - cxlrd->ops->hpa_to_spa = cxl_apply_xor_maps; - cxlrd->ops->spa_to_hpa = cxl_apply_xor_maps; - } + cxlrd->qos_class = cfmws->qtg_id; rc = cxl_decoder_add(cxld); if (rc) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c4bd6e8a0cf0..7120b5f2e31f 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -826,7 +826,7 @@ static struct xarray *cxl_switch_gather_bandwidth(struct cxl_region *cxlr, cxl_coordinates_combine(coords, coords, ctx->coord); /* - * Take the min of the calculated bandwdith and the upstream + * Take the min of the calculated bandwidth and the upstream * switch SSLBIS bandwidth if there's a parent switch */ if (!is_root) @@ -949,7 +949,7 @@ static struct xarray *cxl_hb_gather_bandwidth(struct xarray *xa) /** * cxl_region_update_bandwidth - Update the bandwidth access coordinates of a region * @cxlr: The region being operated on - * @input_xa: xarray holds cxl_perf_ctx wht calculated bandwidth per ACPI0017 instance + * @input_xa: xarray holds cxl_perf_ctx with calculated bandwidth per ACPI0017 instance */ static void cxl_region_update_bandwidth(struct cxl_region *cxlr, struct xarray *input_xa) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index d3a094ca01ad..1c5d2022c87a 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -905,6 +905,9 @@ static void cxl_decoder_reset(struct cxl_decoder *cxld) if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) return; + if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) + return; + if (port->commit_end == id) cxl_port_commit_reap(cxld); else diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 18825e1505d6..5b023a0178a4 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -71,85 +71,6 @@ struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL"); -struct cxl_walk_context { - struct pci_bus *bus; - struct cxl_port *port; - int type; - int error; - int count; -}; - -static int match_add_dports(struct pci_dev *pdev, void *data) -{ - struct cxl_walk_context *ctx = data; - struct cxl_port *port = ctx->port; - int type = pci_pcie_type(pdev); - struct cxl_register_map map; - struct cxl_dport *dport; - u32 lnkcap, port_num; - int rc; - - if (pdev->bus != ctx->bus) - return 0; - if (!pci_is_pcie(pdev)) - return 0; - if (type != ctx->type) - return 0; - if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + 
PCI_EXP_LNKCAP, - &lnkcap)) - return 0; - - rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); - if (rc) - dev_dbg(&port->dev, "failed to find component registers\n"); - - port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); - dport = devm_cxl_add_dport(port, &pdev->dev, port_num, map.resource); - if (IS_ERR(dport)) { - ctx->error = PTR_ERR(dport); - return PTR_ERR(dport); - } - ctx->count++; - - return 0; -} - -/** - * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port - * @port: cxl_port whose ->uport_dev is the upstream of dports to be enumerated - * - * Returns a positive number of dports enumerated or a negative error - * code. - */ -int devm_cxl_port_enumerate_dports(struct cxl_port *port) -{ - struct pci_bus *bus = cxl_port_to_pci_bus(port); - struct cxl_walk_context ctx; - int type; - - if (!bus) - return -ENXIO; - - if (pci_is_root_bus(bus)) - type = PCI_EXP_TYPE_ROOT_PORT; - else - type = PCI_EXP_TYPE_DOWNSTREAM; - - ctx = (struct cxl_walk_context) { - .port = port, - .bus = bus, - .type = type, - }; - pci_walk_bus(bus, match_add_dports, &ctx); - - if (ctx.count == 0) - return -ENODEV; - if (ctx.error) - return ctx.error; - return ctx.count; -} -EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, "CXL"); - static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); @@ -1217,6 +1138,14 @@ int cxl_gpf_port_setup(struct cxl_dport *dport) return 0; } +struct cxl_walk_context { + struct pci_bus *bus; + struct cxl_port *port; + int type; + int error; + int count; +}; + static int count_dports(struct pci_dev *pdev, void *data) { struct cxl_walk_context *ctx = data; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 8128fd2b5b31..fef3aa0c6680 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -459,7 +459,6 @@ static void cxl_root_decoder_release(struct device *dev) if (atomic_read(&cxlrd->region_id) >= 0) memregion_free(atomic_read(&cxlrd->region_id)); __cxl_decoder_release(&cxlrd->cxlsd.cxld); - kfree(cxlrd->ops); kfree(cxlrd); } diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 41b64d871c5a..82d229c8f9bf 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -245,6 +245,9 @@ static void cxl_region_decode_reset(struct cxl_region *cxlr, int count) struct cxl_region_params *p = &cxlr->params; int i; + if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags)) + return; + /* * Before region teardown attempt to flush, evict any data cached for * this region, or scream loudly about missing arch / platform support @@ -419,6 +422,9 @@ static ssize_t commit_store(struct device *dev, struct device_attribute *attr, return len; } + if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags)) + return -EPERM; + rc = queue_reset(cxlr); if (rc) return rc; @@ -461,21 +467,6 @@ static ssize_t commit_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(commit); -static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, - int n) -{ - struct device *dev = kobj_to_dev(kobj); - struct cxl_region *cxlr = to_cxl_region(dev); - - /* - * Support tooling that expects to find a 'uuid' attribute for all - * regions regardless of mode. 
- */ - if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM) - return 0444; - return a->mode; -} - static ssize_t interleave_ways_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -754,6 +745,21 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(size); +static ssize_t extended_linear_cache_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + return sysfs_emit(buf, "%#llx\n", p->cache_size); +} +static DEVICE_ATTR_RO(extended_linear_cache_size); + static struct attribute *cxl_region_attrs[] = { &dev_attr_uuid.attr, &dev_attr_commit.attr, @@ -762,9 +768,34 @@ static struct attribute *cxl_region_attrs[] = { &dev_attr_resource.attr, &dev_attr_size.attr, &dev_attr_mode.attr, + &dev_attr_extended_linear_cache_size.attr, NULL, }; +static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + + /* + * Support tooling that expects to find a 'uuid' attribute for all + * regions regardless of mode. + */ + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM) + return 0444; + + /* + * Don't display extended linear cache attribute if there is no + * extended linear cache. + */ + if (a == &dev_attr_extended_linear_cache_size.attr && + cxlr->params.cache_size == 0) + return 0; + + return a->mode; +} + static const struct attribute_group cxl_region_group = { .attrs = cxl_region_attrs, .is_visible = cxl_region_visible, @@ -838,16 +869,16 @@ static int match_free_decoder(struct device *dev, const void *data) return 1; } -static bool region_res_match_cxl_range(const struct cxl_region_params *p, - const struct range *range) +static bool spa_maps_hpa(const struct cxl_region_params *p, + const struct range *range) { if (!p->res) return false; /* - * If an extended linear cache region then the CXL range is assumed - * to be fronted by the DRAM range in current known implementation. - * This assumption will be made until a variant implementation exists. + * The extended linear cache region is constructed by a 1:1 ratio + * where the SPA maps equal amounts of DRAM and CXL HPA capacity with + * CXL decoders at the high end of the SPA range. 
*/ return p->res->start + p->cache_size == range->start && p->res->end == range->end; @@ -865,7 +896,7 @@ static int match_auto_decoder(struct device *dev, const void *data) cxld = to_cxl_decoder(dev); r = &cxld->hpa_range; - if (region_res_match_cxl_range(p, r)) + if (spa_maps_hpa(p, r)) return 1; return 0; @@ -1059,6 +1090,16 @@ static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr, return 0; } +static void cxl_region_set_lock(struct cxl_region *cxlr, + struct cxl_decoder *cxld) +{ + if (!test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) + return; + + set_bit(CXL_REGION_F_LOCK, &cxlr->flags); + clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); +} + /** * cxl_port_attach_region() - track a region's interest in a port by endpoint * @port: port to add a new region reference 'struct cxl_region_ref' @@ -1170,6 +1211,8 @@ static int cxl_port_attach_region(struct cxl_port *port, } } + cxl_region_set_lock(cxlr, cxld); + rc = cxl_rr_ep_add(cxl_rr, cxled); if (rc) { dev_dbg(&cxlr->dev, @@ -1328,7 +1371,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_endpoint_decoder *cxled) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); - int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos; + int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); @@ -1465,7 +1508,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || (iw > 1 && cxld->interleave_granularity != ig) || - !region_res_match_cxl_range(p, &cxld->hpa_range) || + !spa_maps_hpa(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, "%s:%s %s expected iw: %d ig: %d %pr\n", @@ -1520,9 +1563,8 @@ add_target: cxlsd->target[cxl_rr->nr_targets_set] = ep->dport; cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id; } - inc = 1; + cxl_rr->nr_targets_set++; out_target_set: - cxl_rr->nr_targets_set += inc; dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n", dev_name(port->uport_dev), dev_name(&port->dev), cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev), @@ -2439,6 +2481,7 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i dev->bus = &cxl_bus_type; dev->type = &cxl_region_type; cxlr->id = id; + cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld); return cxlr; } @@ -2924,38 +2967,119 @@ static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos) return false; } -static bool has_hpa_to_spa(struct cxl_root_decoder *cxlrd) +#define CXL_POS_ZERO 0 +/** + * cxl_validate_translation_params + * @eiw: encoded interleave ways + * @eig: encoded interleave granularity + * @pos: position in interleave + * + * Callers pass CXL_POS_ZERO when no position parameter needs validating. 
+ * + * Returns: 0 on success, -EINVAL on first invalid parameter + */ +int cxl_validate_translation_params(u8 eiw, u16 eig, int pos) { - return cxlrd->ops && cxlrd->ops->hpa_to_spa; + int ways, gran; + + if (eiw_to_ways(eiw, &ways)) { + pr_debug("%s: invalid eiw=%u\n", __func__, eiw); + return -EINVAL; + } + if (eig_to_granularity(eig, &gran)) { + pr_debug("%s: invalid eig=%u\n", __func__, eig); + return -EINVAL; + } + if (pos < 0 || pos >= ways) { + pr_debug("%s: invalid pos=%d for ways=%u\n", __func__, pos, + ways); + return -EINVAL; + } + + return 0; } +EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate"); -static bool has_spa_to_hpa(struct cxl_root_decoder *cxlrd) +u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig) { - return cxlrd->ops && cxlrd->ops->spa_to_hpa; + u64 dpa_offset, bits_lower, bits_upper, temp; + int ret; + + ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO); + if (ret) + return ULLONG_MAX; + + /* + * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13 + * Lower bits [IG+7:0] pass through unchanged + * (eiw < 8) + * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW) + * Clear the position bits to isolate upper section, then + * reverse the left shift by eiw that occurred during DPA->HPA + * (eiw >= 8) + * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3 + * Extract upper bits from the correct bit range and divide by 3 + * to recover the original DPA upper bits + */ + bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0); + if (eiw < 8) { + temp = hpa_offset &= ~GENMASK_ULL(eig + eiw + 8 - 1, 0); + dpa_offset = temp >> eiw; + } else { + bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3); + dpa_offset = bits_upper << (eig + 8); + } + dpa_offset |= bits_lower; + + return dpa_offset; } +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate"); -u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, - u64 dpa) +int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); - u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa; - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = NULL; - u16 eig = 0; - u8 eiw = 0; - int pos; + unsigned int ways = 0; + u64 shifted, rem; + int pos, ret; - for (int i = 0; i < p->nr_targets; i++) { - cxled = p->targets[i]; - if (cxlmd == cxled_to_memdev(cxled)) - break; + ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO); + if (ret) + return ret; + + if (!eiw) + /* position is 0 if no interleaving */ + return 0; + + /* + * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13 + * eiw < 8 + * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8]. + * Per spec "remove IW bits starting with bit position IG+8" + * eiw >= 8 + * Position is not explicitly stored in HPA_OFFSET bits. It is + * derived from the modulo operation of the upper bits using + * the total number of interleave ways. 
+ */ + if (eiw < 8) { + pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0); + } else { + shifted = hpa_offset >> (eig + 8); + eiw_to_ways(eiw, &ways); + div64_u64_rem(shifted, ways, &rem); + pos = rem; } - if (!cxled || cxlmd != cxled_to_memdev(cxled)) - return ULLONG_MAX; - pos = cxled->pos; - ways_to_eiw(p->interleave_ways, &eiw); - granularity_to_eig(p->interleave_granularity, &eig); + return pos; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate"); + +u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig) +{ + u64 mask_upper, hpa_offset, bits_upper; + int ret; + + ret = cxl_validate_translation_params(eiw, eig, pos); + if (ret) + return ULLONG_MAX; /* * The device position in the region interleave set was removed @@ -2967,9 +3091,6 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, * 8.2.4.19.13 Implementation Note: Device Decode Logic */ - /* Remove the dpa base */ - dpa_offset = dpa - cxl_dpa_resource_start(cxled); - mask_upper = GENMASK_ULL(51, eig + 8); if (eiw < 8) { @@ -2984,12 +3105,43 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, /* The lower bits remain unchanged */ hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0); + return hpa_offset; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate"); + +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled = NULL; + u64 dpa_offset, hpa_offset, hpa; + u16 eig = 0; + u8 eiw = 0; + int pos; + + for (int i = 0; i < p->nr_targets; i++) { + if (cxlmd == cxled_to_memdev(p->targets[i])) { + cxled = p->targets[i]; + break; + } + } + if (!cxled) + return ULLONG_MAX; + + pos = cxled->pos; + ways_to_eiw(p->interleave_ways, &eiw); + granularity_to_eig(p->interleave_granularity, &eig); + + dpa_offset = dpa - cxl_dpa_resource_start(cxled); + hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig); + /* Apply the hpa_offset to the region base address */ hpa = hpa_offset + p->res->start + p->cache_size; /* Root decoder translation overrides typical modulo decode */ - if (has_hpa_to_spa(cxlrd)) - hpa = cxlrd->ops->hpa_to_spa(cxlrd, hpa); + if (cxlrd->ops.hpa_to_spa) + hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa); if (!cxl_resource_contains_addr(p->res, hpa)) { dev_dbg(&cxlr->dev, @@ -2998,7 +3150,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, } /* Simple chunk check, by pos & gran, only applies to modulo decodes */ - if (!has_hpa_to_spa(cxlrd) && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos))) + if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos)) return ULLONG_MAX; return hpa; @@ -3016,8 +3168,6 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_endpoint_decoder *cxled; u64 hpa, hpa_offset, dpa_offset; - u64 bits_upper, bits_lower; - u64 shifted, rem, temp; u16 eig = 0; u8 eiw = 0; int pos; @@ -3033,56 +3183,21 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, * If the root decoder has SPA to CXL HPA callback, use it. Otherwise * CXL HPA is assumed to equal SPA. 
*/ - if (has_spa_to_hpa(cxlrd)) { - hpa = cxlrd->ops->spa_to_hpa(cxlrd, p->res->start + offset); + if (cxlrd->ops.spa_to_hpa) { + hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset); hpa_offset = hpa - p->res->start; } else { hpa_offset = offset; } - /* - * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13 - * eiw < 8 - * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8]. - * Per spec "remove IW bits starting with bit position IG+8" - * eiw >= 8 - * Position is not explicitly stored in HPA_OFFSET bits. It is - * derived from the modulo operation of the upper bits using - * the total number of interleave ways. - */ - if (eiw < 8) { - pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0); - } else { - shifted = hpa_offset >> (eig + 8); - div64_u64_rem(shifted, p->interleave_ways, &rem); - pos = rem; - } + + pos = cxl_calculate_position(hpa_offset, eiw, eig); if (pos < 0 || pos >= p->nr_targets) { dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n", pos, p->nr_targets); return -ENXIO; } - /* - * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13 - * Lower bits [IG+7:0] pass through unchanged - * (eiw < 8) - * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW) - * Clear the position bits to isolate upper section, then - * reverse the left shift by eiw that occurred during DPA->HPA - * (eiw >= 8) - * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3 - * Extract upper bits from the correct bit range and divide by 3 - * to recover the original DPA upper bits - */ - bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0); - if (eiw < 8) { - temp = hpa_offset &= ~((u64)GENMASK(eig + eiw + 8 - 1, 0)); - dpa_offset = temp >> eiw; - } else { - bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3); - dpa_offset = bits_upper << (eig + 8); - } - dpa_offset |= bits_lower; + dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig); /* Look-up and return the result: a memdev and a DPA */ for (int i = 0; i < p->nr_targets; i++) { @@ -3398,7 +3513,7 @@ static int match_region_by_range(struct device *dev, const void *data) p = &cxlr->params; guard(rwsem_read)(&cxl_rwsem.region); - return region_res_match_cxl_range(p, r); + return spa_maps_hpa(p, r); } static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, @@ -3479,6 +3594,10 @@ static int __construct_region(struct cxl_region *cxlr, "Extended linear cache calculation failed rc:%d\n", rc); } + rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group); + if (rc) + return rc; + rc = insert_resource(cxlrd->res, res); if (rc) { /* diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 231ddccf8977..ba17fa86d249 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -451,7 +451,7 @@ struct cxl_root_decoder { void *platform_data; struct mutex range_lock; int qos_class; - struct cxl_rd_ops *ops; + struct cxl_rd_ops ops; struct cxl_switch_decoder cxlsd; }; @@ -517,6 +517,14 @@ enum cxl_partition_mode { */ #define CXL_REGION_F_NEEDS_RESET 1 +/* + * Indicate whether this region is locked due to 1 or more decoders that have + * been locked. The approach of all or nothing is taken with regard to the + * locked attribute. CXL_REGION_F_NEEDS_RESET should not be set if this flag is + * set. 
+ */ +#define CXL_REGION_F_LOCK 2 + /** * struct cxl_region - CXL region * @dev: This region's device @@ -738,6 +746,25 @@ static inline bool is_cxl_root(struct cxl_port *port) return port->uport_dev == port->dev.parent; } +/* Address translation functions exported to cxl_translate test module only */ +int cxl_validate_translation_params(u8 eiw, u16 eig, int pos); +u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig); +u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig); +int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig); +struct cxl_cxims_data { + int nr_maps; + u64 xormaps[] __counted_by(nr_maps); +}; + +#if IS_ENABLED(CONFIG_CXL_ACPI) +u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw); +#else +static inline u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw) +{ + return ULLONG_MAX; +} +#endif + int cxl_num_decoders_committed(struct cxl_port *port); bool is_cxl_port(const struct device *dev); struct cxl_port *to_cxl_port(const struct device *dev); diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 7ae621e618e7..1d526bea8431 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -127,7 +127,6 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) return lnksta2 & PCI_EXP_LNKSTA2_FLIT; } -int devm_cxl_port_enumerate_dports(struct cxl_port *port); struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); void cxl_cor_error_detected(struct pci_dev *pdev); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index bd100ac31672..0be4e508affe 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -136,7 +136,7 @@ static irqreturn_t cxl_pci_mbox_irq(int irq, void *id) if (opcode == CXL_MBOX_OP_SANITIZE) { mutex_lock(&cxl_mbox->mbox_mutex); if (mds->security.sanitize_node) - mod_delayed_work(system_wq, &mds->security.poll_dwork, 0); + mod_delayed_work(system_percpu_wq, &mds->security.poll_dwork, 0); mutex_unlock(&cxl_mbox->mbox_mutex); } else { /* short-circuit the wait in __cxl_pci_mbox_send_cmd() */ diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile index 70ec901edf2c..2008fb7481b3 100644 --- a/drivers/dma-buf/Makefile +++ b/drivers/dma-buf/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ - dma-fence-unwrap.o dma-resv.o + dma-fence-unwrap.o dma-resv.o dma-buf-mapping.o obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o obj-$(CONFIG_DMABUF_HEAPS) += heaps/ obj-$(CONFIG_SYNC_FILE) += sync_file.o diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c new file mode 100644 index 000000000000..b7352e609fbd --- /dev/null +++ b/drivers/dma-buf/dma-buf-mapping.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DMA BUF Mapping Helpers + * + */ +#include <linux/dma-buf-mapping.h> +#include <linux/dma-resv.h> + +static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length, + dma_addr_t addr) +{ + unsigned int len, nents; + int i; + + nents = DIV_ROUND_UP(length, UINT_MAX); + for (i = 0; i < nents; i++) { + len = min_t(size_t, length, UINT_MAX); + length -= len; + /* + * DMABUF abuses scatterlist to create a scatterlist + * that does not have any CPU list, only the DMA list. + * Always set the page related values to NULL to ensure + * importers can't use it. The phys_addr based DMA API + * does not require the CPU list for mapping or unmapping. 
+ */ + sg_set_page(sgl, NULL, 0, 0); + sg_dma_address(sgl) = addr + (dma_addr_t)i * UINT_MAX; + sg_dma_len(sgl) = len; + sgl = sg_next(sgl); + } + + return sgl; +} + +static unsigned int calc_sg_nents(struct dma_iova_state *state, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size) +{ + unsigned int nents = 0; + size_t i; + + if (!state || !dma_use_iova(state)) { + for (i = 0; i < nr_ranges; i++) + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); + } else { + /* + * In IOVA case, there is only one SG entry which spans + * for whole IOVA address space, but we need to make sure + * that it fits sg->length, maybe we need more. + */ + nents = DIV_ROUND_UP(size, UINT_MAX); + } + + return nents; +} + +/** + * struct dma_buf_dma - holds DMA mapping information + * @sgt: Scatter-gather table + * @state: DMA IOVA state relevant in IOMMU-based DMA + * @size: Total size of DMA transfer + */ +struct dma_buf_dma { + struct sg_table sgt; + struct dma_iova_state *state; + size_t size; +}; + +/** + * dma_buf_phys_vec_to_sgt - Returns the scatterlist table of the attachment + * from arrays of physical vectors. This funciton is intended for MMIO memory + * only. + * @attach: [in] attachment whose scatterlist is to be returned + * @provider: [in] p2pdma provider + * @phys_vec: [in] array of physical vectors + * @nr_ranges: [in] number of entries in phys_vec array + * @size: [in] total size of phys_vec + * @dir: [in] direction of DMA transfer + * + * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR + * on error. May return -EINTR if it is interrupted by a signal. + * + * On success, the DMA addresses and lengths in the returned scatterlist are + * PAGE_SIZE aligned. + * + * A mapping must be unmapped by using dma_buf_free_sgt(). + * + * NOTE: This function is intended for exporters. If direct traffic routing is + * mandatory exporter should call routing pci_p2pdma_map_type() before calling + * this function. + */ +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir) +{ + unsigned int nents, mapped_len = 0; + struct dma_buf_dma *dma; + struct scatterlist *sgl; + dma_addr_t addr; + size_t i; + int ret; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (WARN_ON(!attach || !attach->dmabuf || !provider)) + /* This function is supposed to work on MMIO memory only */ + return ERR_PTR(-EINVAL); + + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) + return ERR_PTR(-ENOMEM); + + switch (pci_p2pdma_map_type(provider, attach->dev)) { + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * There is no need in IOVA at all for this flow. 
+ */ + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + dma->state = kzalloc(sizeof(*dma->state), GFP_KERNEL); + if (!dma->state) { + ret = -ENOMEM; + goto err_free_dma; + } + + dma_iova_try_alloc(attach->dev, dma->state, 0, size); + break; + default: + ret = -EINVAL; + goto err_free_dma; + } + + nents = calc_sg_nents(dma->state, phys_vec, nr_ranges, size); + ret = sg_alloc_table(&dma->sgt, nents, GFP_KERNEL | __GFP_ZERO); + if (ret) + goto err_free_state; + + sgl = dma->sgt.sgl; + + for (i = 0; i < nr_ranges; i++) { + if (!dma->state) { + addr = pci_p2pdma_bus_addr_map(provider, + phys_vec[i].paddr); + } else if (dma_use_iova(dma->state)) { + ret = dma_iova_link(attach->dev, dma->state, + phys_vec[i].paddr, 0, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + if (ret) + goto err_unmap_dma; + + mapped_len += phys_vec[i].len; + } else { + addr = dma_map_phys(attach->dev, phys_vec[i].paddr, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + ret = dma_mapping_error(attach->dev, addr); + if (ret) + goto err_unmap_dma; + } + + if (!dma->state || !dma_use_iova(dma->state)) + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); + } + + if (dma->state && dma_use_iova(dma->state)) { + WARN_ON_ONCE(mapped_len != size); + ret = dma_iova_sync(attach->dev, dma->state, 0, mapped_len); + if (ret) + goto err_unmap_dma; + + sgl = fill_sg_entry(sgl, mapped_len, dma->state->addr); + } + + dma->size = size; + + /* + * No CPU list included â set orig_nents = 0 so others can detect + * this via SG table (use nents only). + */ + dma->sgt.orig_nents = 0; + + + /* + * SGL must be NULL to indicate that SGL is the last one + * and we allocated correct number of entries in sg_alloc_table() + */ + WARN_ON_ONCE(sgl); + return &dma->sgt; + +err_unmap_dma: + if (!i || !dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, mapped_len, dir, + DMA_ATTR_MMIO); + } else { + for_each_sgtable_dma_sg(&dma->sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + sg_free_table(&dma->sgt); +err_free_state: + kfree(dma->state); +err_free_dma: + kfree(dma); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_NS_GPL(dma_buf_phys_vec_to_sgt, "DMA_BUF"); + +/** + * dma_buf_free_sgt- unmaps the buffer + * @attach: [in] attachment to unmap buffer from + * @sgt: [in] scatterlist info of the buffer to unmap + * @dir: [in] direction of DMA transfer + * + * This unmaps a DMA mapping for @attached obtained + * by dma_buf_phys_vec_to_sgt(). 
+ */ +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct dma_buf_dma *dma = container_of(sgt, struct dma_buf_dma, sgt); + int i; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (!dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, dma->size, dir, + DMA_ATTR_MMIO); + } else { + struct scatterlist *sgl; + + for_each_sgtable_dma_sg(sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + + sg_free_table(sgt); + kfree(dma->state); + kfree(dma); + +} +EXPORT_SYMBOL_NS_GPL(dma_buf_free_sgt, "DMA_BUF"); diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c index 02f68b328511..227398673b73 100644 --- a/drivers/dma/ioat/init.c +++ b/drivers/dma/ioat/init.c @@ -1286,7 +1286,6 @@ static pci_ers_result_t ioat_pcie_error_slot_reset(struct pci_dev *pdev) } else { pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); pci_wake_from_d3(pdev, false); } diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c index f0a63d09d3c4..76542a53e202 100644 --- a/drivers/firmware/efi/cper-arm.c +++ b/drivers/firmware/efi/cper-arm.c @@ -93,15 +93,11 @@ static void cper_print_arm_err_info(const char *pfx, u32 type, bool proc_context_corrupt, corrected, precise_pc, restartable_pc; bool time_out, access_mode; - /* If the type is unknown, bail. */ - if (type > CPER_ARM_MAX_TYPE) - return; - /* * Vendor type errors have error information values that are vendor * specific. */ - if (type == CPER_ARM_VENDOR_ERROR) + if (type & CPER_ARM_VENDOR_ERROR) return; if (error_info & CPER_ARM_ERR_VALID_TRANSACTION_TYPE) { @@ -116,43 +112,38 @@ static void cper_print_arm_err_info(const char *pfx, u32 type, if (error_info & CPER_ARM_ERR_VALID_OPERATION_TYPE) { op_type = ((error_info >> CPER_ARM_ERR_OPERATION_SHIFT) & CPER_ARM_ERR_OPERATION_MASK); - switch (type) { - case CPER_ARM_CACHE_ERROR: + if (type & CPER_ARM_CACHE_ERROR) { if (op_type < ARRAY_SIZE(arm_cache_err_op_strs)) { - printk("%soperation type: %s\n", pfx, + printk("%scache error, operation type: %s\n", pfx, arm_cache_err_op_strs[op_type]); } - break; - case CPER_ARM_TLB_ERROR: + } + if (type & CPER_ARM_TLB_ERROR) { if (op_type < ARRAY_SIZE(arm_tlb_err_op_strs)) { - printk("%soperation type: %s\n", pfx, + printk("%sTLB error, operation type: %s\n", pfx, arm_tlb_err_op_strs[op_type]); } - break; - case CPER_ARM_BUS_ERROR: + } + if (type & CPER_ARM_BUS_ERROR) { if (op_type < ARRAY_SIZE(arm_bus_err_op_strs)) { - printk("%soperation type: %s\n", pfx, + printk("%sbus error, operation type: %s\n", pfx, arm_bus_err_op_strs[op_type]); } - break; } } if (error_info & CPER_ARM_ERR_VALID_LEVEL) { level = ((error_info >> CPER_ARM_ERR_LEVEL_SHIFT) & CPER_ARM_ERR_LEVEL_MASK); - switch (type) { - case CPER_ARM_CACHE_ERROR: + if (type & CPER_ARM_CACHE_ERROR) printk("%scache level: %d\n", pfx, level); - break; - case CPER_ARM_TLB_ERROR: + + if (type & CPER_ARM_TLB_ERROR) printk("%sTLB level: %d\n", pfx, level); - break; - case CPER_ARM_BUS_ERROR: + + if (type & CPER_ARM_BUS_ERROR) printk("%saffinity level at which the bus error occurred: %d\n", pfx, level); - break; - } } if (error_info & CPER_ARM_ERR_VALID_PROC_CONTEXT_CORRUPT) { @@ -240,7 +231,8 @@ void cper_print_proc_arm(const char *pfx, int i, len, max_ctx_type; struct cper_arm_err_info *err_info; struct cper_arm_ctx_info *ctx_info; - char newpfx[64], infopfx[64]; + char newpfx[64], 
infopfx[ARRAY_SIZE(newpfx) + 1]; + char error_type[120]; printk("%sMIDR: 0x%016llx\n", pfx, proc->midr); @@ -289,9 +281,15 @@ void cper_print_proc_arm(const char *pfx, newpfx); } - printk("%serror_type: %d, %s\n", newpfx, err_info->type, - err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ? - cper_proc_error_type_strs[err_info->type] : "unknown"); + cper_bits_to_str(error_type, sizeof(error_type), + FIELD_GET(CPER_ARM_ERR_TYPE_MASK, err_info->type), + cper_proc_error_type_strs, + ARRAY_SIZE(cper_proc_error_type_strs)); + + printk("%serror_type: 0x%02x: %s%s\n", newpfx, err_info->type, + error_type, + (err_info->type & ~CPER_ARM_ERR_TYPE_MASK) ? " with reserved bit(s)" : ""); + if (err_info->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) { printk("%serror_info: 0x%016llx\n", newpfx, err_info->error_info); diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 928409199a1a..0232bd040f61 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -12,6 +12,7 @@ * Specification version 2.4. */ +#include <linux/bitmap.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/time.h> @@ -69,7 +70,7 @@ const char *cper_severity_str(unsigned int severity) } EXPORT_SYMBOL_GPL(cper_severity_str); -/* +/** * cper_print_bits - print strings for set bits * @pfx: prefix for each line, including log level and prefix string * @bits: bit mask @@ -106,6 +107,65 @@ void cper_print_bits(const char *pfx, unsigned int bits, printk("%s\n", buf); } +/** + * cper_bits_to_str - return a string for set bits + * @buf: buffer to store the output string + * @buf_size: size of the output string buffer + * @bits: bit mask + * @strs: string array, indexed by bit position + * @strs_size: size of the string array: @strs + * + * Add to @buf the bitmask in hexadecimal. Then, for each set bit in @bits, + * add the corresponding string describing the bit in @strs to @buf. + * + * A typical example is:: + * + * const char * const bits[] = { + * "bit 3 name", + * "bit 4 name", + * "bit 5 name", + * }; + * char str[120]; + * unsigned int bitmask = BIT(3) | BIT(5); + * #define MASK GENMASK(5,3) + * + * cper_bits_to_str(str, sizeof(str), FIELD_GET(MASK, bitmask), + * bits, ARRAY_SIZE(bits)); + * + * The above code fills the string ``str`` with ``bit 3 name|bit 5 name``. + * + * Return: number of bytes stored or an error code if lower than zero. 
+ */ +int cper_bits_to_str(char *buf, int buf_size, unsigned long bits, + const char * const strs[], unsigned int strs_size) +{ + int len = buf_size; + char *str = buf; + int i, size; + + *buf = '\0'; + + for_each_set_bit(i, &bits, strs_size) { + if (!(bits & BIT_ULL(i))) + continue; + + if (*buf && len > 0) { + *str = '|'; + len--; + str++; + } + + size = strscpy(str, strs[i], len); + if (size < 0) + return size; + + len -= size; + str += size; + } + return len - buf_size; +} +EXPORT_SYMBOL_GPL(cper_bits_to_str); + static const char * const proc_type_strs[] = { "IA32/X64", "IA64", diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 874f63b4a383..9cb814c5ba1b 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -56,7 +56,7 @@ static struct screen_info *setup_graphics(void) { struct screen_info *si, tmp = {}; - if (efi_setup_gop(&tmp) != EFI_SUCCESS) + if (efi_setup_graphics(&tmp, NULL) != EFI_SUCCESS) return NULL; si = alloc_screen_info(); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index f5ba032863a9..b2fb0c3fa721 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -34,6 +34,9 @@ #define EFI_ALLOC_LIMIT ULONG_MAX #endif +struct edid_info; +struct screen_info; + extern bool efi_no5lvl; extern bool efi_nochunk; extern bool efi_nokaslr; @@ -578,6 +581,32 @@ union efi_graphics_output_protocol { } mixed_mode; }; +typedef union efi_edid_discovered_protocol efi_edid_discovered_protocol_t; + +union efi_edid_discovered_protocol { + struct { + u32 size_of_edid; + u8 *edid; + }; + struct { + u32 size_of_edid; + u32 edid; + } mixed_mode; +}; + +typedef union efi_edid_active_protocol efi_edid_active_protocol_t; + +union efi_edid_active_protocol { + struct { + u32 size_of_edid; + u8 *edid; + }; + struct { + u32 size_of_edid; + u32 edid; + } mixed_mode; +}; + typedef union { struct { u32 revision; @@ -1085,7 +1114,7 @@ efi_status_t efi_parse_options(char const *cmdline); void efi_parse_option_graphics(char *option); -efi_status_t efi_setup_gop(struct screen_info *si); +efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid); efi_status_t handle_cmdline_files(efi_loaded_image_t *image, const efi_char16_t *optstr, diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c index 3785fb4986b4..72d74436a7a4 100644 --- a/drivers/firmware/efi/libstub/gop.c +++ b/drivers/firmware/efi/libstub/gop.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <asm/efi.h> #include <asm/setup.h> +#include <video/edid.h> #include "efistub.h" @@ -367,24 +368,31 @@ static void find_bits(u32 mask, u8 *pos, u8 *size) *size = __fls(mask) - *pos + 1; } -static void -setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, - efi_pixel_bitmask_t pixel_info, int pixel_format) +static void setup_screen_info(struct screen_info *si, const efi_graphics_output_protocol_t *gop) { - if (pixel_format == PIXEL_BIT_MASK) { - find_bits(pixel_info.red_mask, - &si->red_pos, &si->red_size); - find_bits(pixel_info.green_mask, - &si->green_pos, &si->green_size); - find_bits(pixel_info.blue_mask, - &si->blue_pos, &si->blue_size); - find_bits(pixel_info.reserved_mask, - &si->rsvd_pos, &si->rsvd_size); - si->lfb_depth = si->red_size + si->green_size + - si->blue_size + si->rsvd_size; - si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + const efi_graphics_output_protocol_mode_t *mode = 
efi_table_attr(gop, mode); + const efi_graphics_output_mode_info_t *info = efi_table_attr(mode, info); + + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = info->horizontal_resolution; + si->lfb_height = info->vertical_resolution; + + efi_set_u64_split(efi_table_attr(mode, frame_buffer_base), + &si->lfb_base, &si->ext_lfb_base); + if (si->ext_lfb_base) + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->pages = 1; + + if (info->pixel_format == PIXEL_BIT_MASK) { + find_bits(info->pixel_information.red_mask, &si->red_pos, &si->red_size); + find_bits(info->pixel_information.green_mask, &si->green_pos, &si->green_size); + find_bits(info->pixel_information.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(info->pixel_information.reserved_mask, &si->rsvd_pos, &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + si->blue_size + si->rsvd_size; + si->lfb_linelength = (info->pixels_per_scan_line * si->lfb_depth) / 8; } else { - if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + if (info->pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { si->red_pos = 0; si->blue_pos = 16; } else /* PIXEL_BGR_RESERVED_8BIT_PER_COLOR */ { @@ -394,20 +402,33 @@ setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, si->green_pos = 8; si->rsvd_pos = 24; - si->red_size = si->green_size = - si->blue_size = si->rsvd_size = 8; - + si->red_size = 8; + si->green_size = 8; + si->blue_size = 8; + si->rsvd_size = 8; si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; + si->lfb_linelength = info->pixels_per_scan_line * 4; } + + si->lfb_size = si->lfb_linelength * si->lfb_height; + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; } -static efi_graphics_output_protocol_t *find_gop(unsigned long num, - const efi_handle_t handles[]) +static void setup_edid_info(struct edid_info *edid, u32 gop_size_of_edid, u8 *gop_edid) +{ + if (!gop_edid || gop_size_of_edid < 128) + memset(edid->dummy, 0, sizeof(edid->dummy)); + else + memcpy(edid->dummy, gop_edid, min(gop_size_of_edid, sizeof(edid->dummy))); +} + +static efi_handle_t find_handle_with_primary_gop(unsigned long num, const efi_handle_t handles[], + efi_graphics_output_protocol_t **found_gop) { efi_graphics_output_protocol_t *first_gop; - efi_handle_t h; + efi_handle_t h, first_gop_handle; + first_gop_handle = NULL; first_gop = NULL; for_each_efi_handle(h, handles, num) { @@ -442,21 +463,25 @@ static efi_graphics_output_protocol_t *find_gop(unsigned long num, */ status = efi_bs_call(handle_protocol, h, &EFI_CONSOLE_OUT_DEVICE_GUID, &dummy); - if (status == EFI_SUCCESS) - return gop; - - if (!first_gop) + if (status == EFI_SUCCESS) { + if (found_gop) + *found_gop = gop; + return h; + } else if (!first_gop_handle) { + first_gop_handle = h; first_gop = gop; + } } - return first_gop; + if (found_gop) + *found_gop = first_gop; + return first_gop_handle; } -efi_status_t efi_setup_gop(struct screen_info *si) +efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid) { efi_handle_t *handles __free(efi_pool) = NULL; - efi_graphics_output_protocol_mode_t *mode; - efi_graphics_output_mode_info_t *info; + efi_handle_t handle; efi_graphics_output_protocol_t *gop; efi_status_t status; unsigned long num; @@ -467,35 +492,41 @@ efi_status_t efi_setup_gop(struct screen_info *si) if (status != EFI_SUCCESS) return status; - gop = find_gop(num, handles); - if (!gop) + handle = find_handle_with_primary_gop(num, handles, &gop); + if (!handle) return EFI_NOT_FOUND; /* Change mode if requested */ set_mode(gop); /* 
EFI framebuffer */ - mode = efi_table_attr(gop, mode); - info = efi_table_attr(mode, info); - - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = info->horizontal_resolution; - si->lfb_height = info->vertical_resolution; - - efi_set_u64_split(efi_table_attr(mode, frame_buffer_base), - &si->lfb_base, &si->ext_lfb_base); - if (si->ext_lfb_base) - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - - si->pages = 1; - - setup_pixel_info(si, info->pixels_per_scan_line, - info->pixel_information, info->pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; + if (si) + setup_screen_info(si, gop); + + /* Display EDID for primary GOP */ + if (edid) { + efi_edid_discovered_protocol_t *discovered_edid; + efi_edid_active_protocol_t *active_edid; + u32 gop_size_of_edid = 0; + u8 *gop_edid = NULL; + + status = efi_bs_call(handle_protocol, handle, &EFI_EDID_ACTIVE_PROTOCOL_GUID, + (void **)&active_edid); + if (status == EFI_SUCCESS) { + gop_size_of_edid = active_edid->size_of_edid; + gop_edid = active_edid->edid; + } else { + status = efi_bs_call(handle_protocol, handle, + &EFI_EDID_DISCOVERED_PROTOCOL_GUID, + (void **)&discovered_edid); + if (status == EFI_SUCCESS) { + gop_size_of_edid = discovered_edid->size_of_edid; + gop_edid = discovered_edid->edid; + } + } - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; + setup_edid_info(edid, gop_size_of_edid, gop_edid); + } return EFI_SUCCESS; } diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 761121a77f9e..cef32e2c82d8 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -203,6 +203,104 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) } } +struct smbios_entry_point { + u8 anchor[4]; + u8 ep_checksum; + u8 ep_length; + u8 major_version; + u8 minor_version; + u16 max_size_entry; + u8 ep_rev; + u8 reserved[5]; + + struct __packed { + u8 anchor[5]; + u8 checksum; + u16 st_length; + u32 st_address; + u16 number_of_entries; + u8 bcd_rev; + } intm; +}; + +static bool verify_ep_checksum(const void *ptr, int length) +{ + u8 sum = 0; + + for (int i = 0; i < length; i++) + sum += ((u8 *)ptr)[i]; + + return sum == 0; +} + +static bool verify_ep_integrity(const struct smbios_entry_point *ep) +{ + if (memcmp(ep->anchor, "_SM_", sizeof(ep->anchor)) != 0) + return false; + + if (memcmp(ep->intm.anchor, "_DMI_", sizeof(ep->intm.anchor)) != 0) + return false; + + if (!verify_ep_checksum(ep, ep->ep_length) || + !verify_ep_checksum(&ep->intm, sizeof(ep->intm))) + return false; + + return true; +} + +static const struct efi_smbios_record *search_record(void *table, u32 length, + u8 type) +{ + const u8 *p, *end; + + p = (u8 *)table; + end = p + length; + + while (p + sizeof(struct efi_smbios_record) < end) { + const struct efi_smbios_record *hdr = + (struct efi_smbios_record *)p; + const u8 *next; + + if (hdr->type == type) + return hdr; + + /* Type 127 = End-of-Table */ + if (hdr->type == 0x7F) + return NULL; + + /* Jumping to the unformed section */ + next = p + hdr->length; + + /* Unformed section ends with 0000h */ + while ((next[0] != 0 || next[1] != 0) && next + 1 < end) + next++; + + next += 2; + p = next; + } + + return NULL; +} + +static const struct efi_smbios_record *get_table_record(u8 type) +{ + const struct smbios_entry_point *ep; + + /* + * Locate the legacy 32-bit SMBIOS entrypoint in memory, and parse it + * directly. Needed by some Macs that do not implement the EFI protocol. 
+ */ + ep = get_efi_config_table(SMBIOS_TABLE_GUID); + if (!ep) + return NULL; + + if (!verify_ep_integrity(ep)) + return NULL; + + return search_record((void *)(unsigned long)ep->intm.st_address, + ep->intm.st_length, type); +} + static bool apple_match_product_name(void) { static const char type1_product_matches[][15] = { @@ -218,7 +316,8 @@ static bool apple_match_product_name(void) const struct efi_smbios_type1_record *record; const u8 *product; - record = (struct efi_smbios_type1_record *)efi_get_smbios_record(1); + record = (struct efi_smbios_type1_record *) + (efi_get_smbios_record(1) ?: get_table_record(1)); if (!record) return false; @@ -388,8 +487,9 @@ static void setup_quirks(struct boot_params *boot_params) static void setup_graphics(struct boot_params *boot_params) { struct screen_info *si = memset(&boot_params->screen_info, 0, sizeof(*si)); + struct edid_info *edid = memset(&boot_params->edid_info, 0, sizeof(*edid)); - efi_setup_gop(si); + efi_setup_graphics(si, edid); } static void __noreturn efi_exit(efi_handle_t handle, efi_status_t status) diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c index c38b1a335590..e727cc5909cb 100644 --- a/drivers/firmware/efi/memattr.c +++ b/drivers/firmware/efi/memattr.c @@ -19,19 +19,19 @@ unsigned long __ro_after_init efi_mem_attr_table = EFI_INVALID_TABLE_ADDR; * Reserve the memory associated with the Memory Attributes configuration * table, if it exists. */ -int __init efi_memattr_init(void) +void __init efi_memattr_init(void) { efi_memory_attributes_table_t *tbl; unsigned long size; if (efi_mem_attr_table == EFI_INVALID_TABLE_ADDR) - return 0; + return; tbl = early_memremap(efi_mem_attr_table, sizeof(*tbl)); if (!tbl) { pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", efi_mem_attr_table); - return -ENOMEM; + return; } if (tbl->version > 2) { @@ -61,7 +61,6 @@ int __init efi_memattr_init(void) unmap: early_memunmap(tbl, sizeof(*tbl)); - return 0; } /* diff --git a/drivers/firmware/efi/riscv-runtime.c b/drivers/firmware/efi/riscv-runtime.c index fa71cd898120..4a2588358be2 100644 --- a/drivers/firmware/efi/riscv-runtime.c +++ b/drivers/firmware/efi/riscv-runtime.c @@ -36,20 +36,12 @@ static bool __init efi_virtmap_init(void) init_new_context(NULL, &efi_mm); for_each_efi_memory_desc(md) { - phys_addr_t phys = md->phys_addr; - int ret; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == U64_MAX) return false; - ret = efi_create_mapping(&efi_mm, md); - if (ret) { - pr_warn(" EFI remap %pa: failed to create mapping (%d)\n", - &phys, ret); - return false; - } + efi_create_mapping(&efi_mm, md); } if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) diff --git a/drivers/firmware/efi/stmm/mm_communication.h b/drivers/firmware/efi/stmm/mm_communication.h index 52a1f32cd1eb..06e7663f96dc 100644 --- a/drivers/firmware/efi/stmm/mm_communication.h +++ b/drivers/firmware/efi/stmm/mm_communication.h @@ -32,7 +32,7 @@ /** * struct efi_mm_communicate_header - Header used for SMM variable communication - + * * @header_guid: header use for disambiguation of content * @message_len: length of the message. Does not include the size of the * header @@ -111,7 +111,7 @@ struct efi_mm_communicate_header { /** * struct smm_variable_communicate_header - Used for SMM variable communication - + * * @function: function to call in Smm. 
* @ret_status: return status * @data: payload @@ -128,7 +128,7 @@ struct smm_variable_communicate_header { /** * struct smm_variable_access - Used to communicate with StMM by * SetVariable and GetVariable. - + * * @guid: vendor GUID * @data_size: size of EFI variable data * @name_size: size of EFI name diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a1817b4b5173..58c3ffe707d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1678,9 +1678,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); struct pci_bus *root; struct resource *res; + int max_size, r; unsigned int i; u16 cmd; - int r; if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) return 0; @@ -1726,30 +1726,28 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) return 0; /* Limit the BAR size to what is available */ - rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, - rbar_size); + max_size = pci_rebar_get_max_size(adev->pdev, 0); + if (max_size < 0) + return 0; + rbar_size = min(max_size, rbar_size); /* Disable memory decoding while we change the BAR addresses and size */ pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); pci_write_config_word(adev->pdev, PCI_COMMAND, cmd & ~PCI_COMMAND_MEMORY); - /* Free the VRAM and doorbell BAR, we most likely need to move both. */ + /* Tear down doorbell as resizing will release BARs */ amdgpu_doorbell_fini(adev); - if (adev->asic_type >= CHIP_BONAIRE) - pci_release_resource(adev->pdev, 2); - pci_release_resource(adev->pdev, 0); - - r = pci_resize_resource(adev->pdev, 0, rbar_size); + r = pci_resize_resource(adev->pdev, 0, rbar_size, + (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5 + : 1 << 2); if (r == -ENOSPC) dev_info(adev->dev, "Not enough PCI address space for a large BAR."); else if (r && r != -ENOTSUPP) dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); - pci_assign_unassigned_bus_resources(adev->pdev->bus); - /* When the doorbell or fb BAR isn't available we have no chance of * using the device. */ diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 7c89e5e0a277..4db24050edb0 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -239,6 +239,8 @@ i915-y += \ display/intel_cdclk.o \ display/intel_cmtg.o \ display/intel_color.o \ + display/intel_colorop.o \ + display/intel_color_pipeline.o \ display/intel_combo_phy.o \ display/intel_connector.o \ display/intel_crtc.o \ diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c index a217a67ceb43..e7950655434b 100644 --- a/drivers/gpu/drm/i915/display/intel_color.c +++ b/drivers/gpu/drm/i915/display/intel_color.c @@ -32,6 +32,8 @@ #include "intel_display_utils.h" #include "intel_dsb.h" #include "intel_vrr.h" +#include "skl_universal_plane.h" +#include "skl_universal_plane_regs.h" struct intel_color_funcs { int (*color_check)(struct intel_atomic_state *state, @@ -87,6 +89,14 @@ struct intel_color_funcs { * Read config other than LUTs and CSCs, before them. Optional. 
*/ void (*get_config)(struct intel_crtc_state *crtc_state); + + /* Plane CSC*/ + void (*load_plane_csc_matrix)(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state); + + /* Plane Pre/Post CSC */ + void (*load_plane_luts)(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state); }; #define CTM_COEFF_SIGN (1ULL << 63) @@ -609,6 +619,8 @@ static u16 ctm_to_twos_complement(u64 coeff, int int_bits, int frac_bits) if (CTM_COEFF_NEGATIVE(coeff)) c = -c; + int_bits = max(int_bits, 1); + c = clamp(c, -(s64)BIT(int_bits + frac_bits - 1), (s64)(BIT(int_bits + frac_bits - 1) - 1)); @@ -3836,6 +3848,266 @@ static void icl_read_luts(struct intel_crtc_state *crtc_state) } } +static void +xelpd_load_plane_csc_matrix(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state) +{ + struct intel_display *display = to_intel_display(plane_state); + const struct drm_plane_state *state = &plane_state->uapi; + enum pipe pipe = to_intel_plane(state->plane)->pipe; + enum plane_id plane = to_intel_plane(state->plane)->id; + const struct drm_property_blob *blob = plane_state->hw.ctm; + struct drm_color_ctm_3x4 *ctm; + const u64 *input; + u16 coeffs[9] = {}; + int i, j; + + if (!icl_is_hdr_plane(display, plane) || !blob) + return; + + ctm = blob->data; + input = ctm->matrix; + + /* + * Convert fixed point S31.32 input to format supported by the + * hardware. + */ + for (i = 0, j = 0; i < ARRAY_SIZE(coeffs); i++) { + u64 abs_coeff = ((1ULL << 63) - 1) & input[j]; + + /* + * Clamp input value to min/max supported by + * hardware. + */ + abs_coeff = clamp_val(abs_coeff, 0, CTM_COEFF_4_0 - 1); + + /* sign bit */ + if (CTM_COEFF_NEGATIVE(input[j])) + coeffs[i] |= 1 << 15; + + if (abs_coeff < CTM_COEFF_0_125) + coeffs[i] |= (3 << 12) | + ILK_CSC_COEFF_FP(abs_coeff, 12); + else if (abs_coeff < CTM_COEFF_0_25) + coeffs[i] |= (2 << 12) | + ILK_CSC_COEFF_FP(abs_coeff, 11); + else if (abs_coeff < CTM_COEFF_0_5) + coeffs[i] |= (1 << 12) | + ILK_CSC_COEFF_FP(abs_coeff, 10); + else if (abs_coeff < CTM_COEFF_1_0) + coeffs[i] |= ILK_CSC_COEFF_FP(abs_coeff, 9); + else if (abs_coeff < CTM_COEFF_2_0) + coeffs[i] |= (7 << 12) | + ILK_CSC_COEFF_FP(abs_coeff, 8); + else + coeffs[i] |= (6 << 12) | + ILK_CSC_COEFF_FP(abs_coeff, 7); + + /* Skip postoffs */ + if (!((j + 2) % 4)) + j += 2; + else + j++; + } + + intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 0), + coeffs[0] << 16 | coeffs[1]); + intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 1), + coeffs[2] << 16); + + intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 2), + coeffs[3] << 16 | coeffs[4]); + intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 3), + coeffs[5] << 16); + + intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 4), + coeffs[6] << 16 | coeffs[7]); + intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 5), + coeffs[8] << 16); + + intel_de_write_dsb(display, dsb, PLANE_CSC_PREOFF(pipe, plane, 0), 0); + intel_de_write_dsb(display, dsb, PLANE_CSC_PREOFF(pipe, plane, 1), 0); + intel_de_write_dsb(display, dsb, PLANE_CSC_PREOFF(pipe, plane, 2), 0); + + /* + * Conversion from S31.32 to S0.12. 
BIT[12] is the signed bit + */ + intel_de_write_dsb(display, dsb, + PLANE_CSC_POSTOFF(pipe, plane, 0), + ctm_to_twos_complement(input[3], 0, 12)); + intel_de_write_dsb(display, dsb, + PLANE_CSC_POSTOFF(pipe, plane, 1), + ctm_to_twos_complement(input[7], 0, 12)); + intel_de_write_dsb(display, dsb, + PLANE_CSC_POSTOFF(pipe, plane, 2), + ctm_to_twos_complement(input[11], 0, 12)); +} + +static void +xelpd_program_plane_pre_csc_lut(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state) +{ + struct intel_display *display = to_intel_display(plane_state); + const struct drm_plane_state *state = &plane_state->uapi; + enum pipe pipe = to_intel_plane(state->plane)->pipe; + enum plane_id plane = to_intel_plane(state->plane)->id; + const struct drm_color_lut32 *pre_csc_lut = plane_state->hw.degamma_lut->data; + u32 i, lut_size; + + if (icl_is_hdr_plane(display, plane)) { + lut_size = 128; + + intel_de_write_dsb(display, dsb, + PLANE_PRE_CSC_GAMC_INDEX_ENH(pipe, plane, 0), + PLANE_PAL_PREC_AUTO_INCREMENT); + + if (pre_csc_lut) { + for (i = 0; i < lut_size; i++) { + u32 lut_val = drm_color_lut32_extract(pre_csc_lut[i].green, 24); + + intel_de_write_dsb(display, dsb, + PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, 0), + lut_val); + } + + /* Program the max registers to clamp values > 1.0. */ + /* TODO: Restrict to 0x7ffffff */ + do { + intel_de_write_dsb(display, dsb, + PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, 0), + (1 << 24)); + } while (i++ < 130); + } else { + for (i = 0; i < lut_size; i++) { + u32 v = (i * ((1 << 24) - 1)) / (lut_size - 1); + + intel_de_write_dsb(display, dsb, + PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, 0), v); + } + + do { + intel_de_write_dsb(display, dsb, + PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, 0), + 1 << 24); + } while (i++ < 130); + } + + intel_de_write_dsb(display, dsb, PLANE_PRE_CSC_GAMC_INDEX_ENH(pipe, plane, 0), 0); + } +} + +static void +xelpd_program_plane_post_csc_lut(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state) +{ + struct intel_display *display = to_intel_display(plane_state); + const struct drm_plane_state *state = &plane_state->uapi; + enum pipe pipe = to_intel_plane(state->plane)->pipe; + enum plane_id plane = to_intel_plane(state->plane)->id; + const struct drm_color_lut32 *post_csc_lut = plane_state->hw.gamma_lut->data; + u32 i, lut_size, lut_val; + + if (icl_is_hdr_plane(display, plane)) { + intel_de_write_dsb(display, dsb, PLANE_POST_CSC_GAMC_INDEX_ENH(pipe, plane, 0), + PLANE_PAL_PREC_AUTO_INCREMENT); + /* TODO: Add macro */ + intel_de_write_dsb(display, dsb, PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH(pipe, plane, 0), + PLANE_PAL_PREC_AUTO_INCREMENT); + if (post_csc_lut) { + lut_size = 32; + for (i = 0; i < lut_size; i++) { + lut_val = drm_color_lut32_extract(post_csc_lut[i].green, 24); + + intel_de_write_dsb(display, dsb, + PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, 0), + lut_val); + } + + /* Segment 2 */ + do { + intel_de_write_dsb(display, dsb, + PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, 0), + (1 << 24)); + } while (i++ < 34); + } else { + /* TODO: Add for segment 0 */ + lut_size = 32; + for (i = 0; i < lut_size; i++) { + u32 v = (i * ((1 << 24) - 1)) / (lut_size - 1); + + intel_de_write_dsb(display, dsb, + PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, 0), v); + } + + do { + intel_de_write_dsb(display, dsb, + PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, 0), + 1 << 24); + } while (i++ < 34); + } + + intel_de_write_dsb(display, dsb, PLANE_POST_CSC_GAMC_INDEX_ENH(pipe, plane, 0), 0); + intel_de_write_dsb(display, dsb, +
PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH(pipe, plane, 0), 0); + } +} + +static void +xelpd_plane_load_luts(struct intel_dsb *dsb, const struct intel_plane_state *plane_state) +{ + if (plane_state->hw.degamma_lut) + xelpd_program_plane_pre_csc_lut(dsb, plane_state); + + if (plane_state->hw.gamma_lut) + xelpd_program_plane_post_csc_lut(dsb, plane_state); +} + +static u32 glk_3dlut_10(const struct drm_color_lut32 *color) +{ + return REG_FIELD_PREP(LUT_3D_DATA_RED_MASK, drm_color_lut32_extract(color->red, 10)) | + REG_FIELD_PREP(LUT_3D_DATA_GREEN_MASK, drm_color_lut32_extract(color->green, 10)) | + REG_FIELD_PREP(LUT_3D_DATA_BLUE_MASK, drm_color_lut32_extract(color->blue, 10)); +} + +static void glk_load_lut_3d(struct intel_dsb *dsb, + struct intel_crtc *crtc, + const struct drm_property_blob *blob) +{ + struct intel_display *display = to_intel_display(crtc->base.dev); + const struct drm_color_lut32 *lut = blob->data; + int i, lut_size = drm_color_lut32_size(blob); + enum pipe pipe = crtc->pipe; + + if (!dsb && intel_de_read(display, LUT_3D_CTL(pipe)) & LUT_3D_READY) { + drm_err(display->drm, "[CRTC:%d:%s] 3D LUT not ready, not loading LUTs\n", + crtc->base.base.id, crtc->base.name); + return; + } + + intel_de_write_dsb(display, dsb, LUT_3D_INDEX(pipe), LUT_3D_AUTO_INCREMENT); + for (i = 0; i < lut_size; i++) + intel_de_write_dsb(display, dsb, LUT_3D_DATA(pipe), glk_3dlut_10(&lut[i])); + intel_de_write_dsb(display, dsb, LUT_3D_INDEX(pipe), 0); +} + +static void glk_lut_3d_commit(struct intel_dsb *dsb, struct intel_crtc *crtc, bool enable) +{ + struct intel_display *display = to_intel_display(crtc); + enum pipe pipe = crtc->pipe; + u32 val = 0; + + if (!dsb && intel_de_read(display, LUT_3D_CTL(pipe)) & LUT_3D_READY) { + drm_err(display->drm, "[CRTC:%d:%s] 3D LUT not ready, not committing change\n", + crtc->base.base.id, crtc->base.name); + return; + } + + if (enable) + val = LUT_3D_ENABLE | LUT_3D_READY | LUT_3D_BIND_PLANE_1; + + intel_de_write_dsb(display, dsb, LUT_3D_CTL(pipe), val); +} + static const struct intel_color_funcs chv_color_funcs = { .color_check = chv_color_check, .color_commit_arm = i9xx_color_commit_arm, @@ -3883,6 +4155,8 @@ static const struct intel_color_funcs tgl_color_funcs = { .lut_equal = icl_lut_equal, .read_csc = icl_read_csc, .get_config = skl_get_config, + .load_plane_csc_matrix = xelpd_load_plane_csc_matrix, + .load_plane_luts = xelpd_plane_load_luts, }; static const struct intel_color_funcs icl_color_funcs = { @@ -3963,6 +4237,67 @@ static const struct intel_color_funcs ilk_color_funcs = { .get_config = ilk_get_config, }; +void intel_color_plane_commit_arm(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state) +{ + struct intel_display *display = to_intel_display(plane_state); + struct intel_crtc *crtc = to_intel_crtc(plane_state->uapi.crtc); + + if (crtc && intel_color_crtc_has_3dlut(display, crtc->pipe)) + glk_lut_3d_commit(dsb, crtc, !!plane_state->hw.lut_3d); +} + +static void +intel_color_load_plane_csc_matrix(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state) +{ + struct intel_display *display = to_intel_display(plane_state); + + if (display->funcs.color->load_plane_csc_matrix) + display->funcs.color->load_plane_csc_matrix(dsb, plane_state); +} + +static void +intel_color_load_plane_luts(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state) +{ + struct intel_display *display = to_intel_display(plane_state); + + if (display->funcs.color->load_plane_luts) + display->funcs.color->load_plane_luts(dsb, plane_state); +} + 
+bool +intel_color_crtc_has_3dlut(struct intel_display *display, enum pipe pipe) +{ + if (DISPLAY_VER(display) >= 12) + return pipe == PIPE_A || pipe == PIPE_B; + else + return false; +} + +static void +intel_color_load_3dlut(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state) +{ + struct intel_display *display = to_intel_display(plane_state); + struct intel_crtc *crtc = to_intel_crtc(plane_state->uapi.crtc); + + if (crtc && intel_color_crtc_has_3dlut(display, crtc->pipe)) + glk_load_lut_3d(dsb, crtc, plane_state->hw.lut_3d); +} + +void intel_color_plane_program_pipeline(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state) +{ + if (plane_state->hw.ctm) + intel_color_load_plane_csc_matrix(dsb, plane_state); + if (plane_state->hw.degamma_lut || plane_state->hw.gamma_lut) + intel_color_load_plane_luts(dsb, plane_state); + if (plane_state->hw.lut_3d) + intel_color_load_3dlut(dsb, plane_state); +} + void intel_color_crtc_init(struct intel_crtc *crtc) { struct intel_display *display = to_intel_display(crtc); diff --git a/drivers/gpu/drm/i915/display/intel_color.h b/drivers/gpu/drm/i915/display/intel_color.h index bf7a12ce9df0..c21b9bdf7bb8 100644 --- a/drivers/gpu/drm/i915/display/intel_color.h +++ b/drivers/gpu/drm/i915/display/intel_color.h @@ -13,7 +13,9 @@ struct intel_crtc_state; struct intel_crtc; struct intel_display; struct intel_dsb; +struct intel_plane_state; struct drm_property_blob; +enum pipe; void intel_color_init_hooks(struct intel_display *display); int intel_color_init(struct intel_display *display); @@ -40,5 +42,9 @@ bool intel_color_lut_equal(const struct intel_crtc_state *crtc_state, const struct drm_property_blob *blob2, bool is_pre_csc_lut); void intel_color_assert_luts(const struct intel_crtc_state *crtc_state); - +void intel_color_plane_program_pipeline(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state); +void intel_color_plane_commit_arm(struct intel_dsb *dsb, + const struct intel_plane_state *plane_state); +bool intel_color_crtc_has_3dlut(struct intel_display *display, enum pipe pipe); #endif /* __INTEL_COLOR_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_color_pipeline.c b/drivers/gpu/drm/i915/display/intel_color_pipeline.c new file mode 100644 index 000000000000..942d9b9c93ce --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_color_pipeline.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2025 Intel Corporation + */ +#include "intel_color.h" +#include "intel_colorop.h" +#include "intel_color_pipeline.h" +#include "intel_de.h" +#include "intel_display_types.h" +#include "skl_universal_plane.h" + +#define MAX_COLOR_PIPELINES 1 +#define PLANE_DEGAMMA_SIZE 128 +#define PLANE_GAMMA_SIZE 32 + +static +int _intel_color_pipeline_plane_init(struct drm_plane *plane, struct drm_prop_enum_list *list, + enum pipe pipe) +{ + struct drm_device *dev = plane->dev; + struct intel_display *display = to_intel_display(dev); + struct drm_colorop *prev_op; + struct intel_colorop *colorop; + int ret; + + colorop = intel_colorop_create(INTEL_PLANE_CB_PRE_CSC_LUT); + + ret = drm_plane_colorop_curve_1d_lut_init(dev, &colorop->base, plane, + PLANE_DEGAMMA_SIZE, + DRM_COLOROP_LUT1D_INTERPOLATION_LINEAR, + DRM_COLOROP_FLAG_ALLOW_BYPASS); + + if (ret) + return ret; + + list->type = colorop->base.base.id; + list->name = kasprintf(GFP_KERNEL, "Color Pipeline %d", colorop->base.base.id); + + /* TODO: handle failures and clean up */ + prev_op = &colorop->base; + + if (DISPLAY_VER(display) >= 35 && + 
intel_color_crtc_has_3dlut(display, pipe) && + plane->type == DRM_PLANE_TYPE_PRIMARY) { + colorop = intel_colorop_create(INTEL_PLANE_CB_3DLUT); + + ret = drm_plane_colorop_3dlut_init(dev, &colorop->base, plane, 17, + DRM_COLOROP_LUT3D_INTERPOLATION_TETRAHEDRAL, + true); + if (ret) + return ret; + + drm_colorop_set_next_property(prev_op, &colorop->base); + + prev_op = &colorop->base; + } + + colorop = intel_colorop_create(INTEL_PLANE_CB_CSC); + ret = drm_plane_colorop_ctm_3x4_init(dev, &colorop->base, plane, + DRM_COLOROP_FLAG_ALLOW_BYPASS); + if (ret) + return ret; + + drm_colorop_set_next_property(prev_op, &colorop->base); + prev_op = &colorop->base; + + colorop = intel_colorop_create(INTEL_PLANE_CB_POST_CSC_LUT); + ret = drm_plane_colorop_curve_1d_lut_init(dev, &colorop->base, plane, + PLANE_GAMMA_SIZE, + DRM_COLOROP_LUT1D_INTERPOLATION_LINEAR, + DRM_COLOROP_FLAG_ALLOW_BYPASS); + if (ret) + return ret; + + drm_colorop_set_next_property(prev_op, &colorop->base); + + return 0; +} + +int intel_color_pipeline_plane_init(struct drm_plane *plane, enum pipe pipe) +{ + struct drm_device *dev = plane->dev; + struct intel_display *display = to_intel_display(dev); + struct drm_prop_enum_list pipelines[MAX_COLOR_PIPELINES]; + int len = 0; + int ret; + + /* Currently expose pipeline only for HDR planes */ + if (!icl_is_hdr_plane(display, to_intel_plane(plane)->id)) + return 0; + + /* Add pipeline consisting of transfer functions */ + ret = _intel_color_pipeline_plane_init(plane, &pipelines[len], pipe); + if (ret) + return ret; + len++; + + return drm_plane_create_color_pipeline_property(plane, pipelines, len); +} diff --git a/drivers/gpu/drm/i915/display/intel_color_pipeline.h b/drivers/gpu/drm/i915/display/intel_color_pipeline.h new file mode 100644 index 000000000000..a457d306da7f --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_color_pipeline.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef __INTEL_COLOR_PIPELINE_H__ +#define __INTEL_COLOR_PIPELINE_H__ + +struct drm_plane; +enum pipe; + +int intel_color_pipeline_plane_init(struct drm_plane *plane, enum pipe pipe); + +#endif /* __INTEL_COLOR_PIPELINE_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_color_regs.h b/drivers/gpu/drm/i915/display/intel_color_regs.h index 8eb643cfead7..c370b6029369 100644 --- a/drivers/gpu/drm/i915/display/intel_color_regs.h +++ b/drivers/gpu/drm/i915/display/intel_color_regs.h @@ -316,4 +316,33 @@ #define SKL_BOTTOM_COLOR_CSC_ENABLE REG_BIT(30) #define SKL_BOTTOM_COLOR(pipe) _MMIO_PIPE(pipe, _SKL_BOTTOM_COLOR_A, _SKL_BOTTOM_COLOR_B) +/* 3D LUT */ +#define _LUT_3D_CTL_A 0x490A4 +#define _LUT_3D_CTL_B 0x491A4 +#define LUT_3D_CTL(pipe) _MMIO_PIPE(pipe, _LUT_3D_CTL_A, _LUT_3D_CTL_B) +#define LUT_3D_ENABLE REG_BIT(31) +#define LUT_3D_READY REG_BIT(30) +#define LUT_3D_BINDING_MASK REG_GENMASK(23, 22) +#define LUT_3D_BIND_PIPE REG_FIELD_PREP(LUT_3D_BINDING_MASK, 0) +#define LUT_3D_BIND_PLANE_1 REG_FIELD_PREP(LUT_3D_BINDING_MASK, 1) +#define LUT_3D_BIND_PLANE_2 REG_FIELD_PREP(LUT_3D_BINDING_MASK, 2) +#define LUT_3D_BIND_PLANE_3 REG_FIELD_PREP(LUT_3D_BINDING_MASK, 3) + +#define _LUT_3D_INDEX_A 0x490A8 +#define _LUT_3D_INDEX_B 0x491A8 +#define LUT_3D_INDEX(pipe) _MMIO_PIPE(pipe, _LUT_3D_INDEX_A, _LUT_3D_INDEX_B) +#define LUT_3D_AUTO_INCREMENT REG_BIT(13) +#define LUT_3D_INDEX_VALUE_MASK REG_GENMASK(12, 0) +#define LUT_3D_INDEX_VALUE(x) REG_FIELD_PREP(LUT_3D_INDEX_VALUE_MASK, (x)) + +#define _LUT_3D_DATA_A 0x490AC +#define _LUT_3D_DATA_B 0x491AC +#define 
LUT_3D_DATA(pipe) _MMIO_PIPE(pipe, _LUT_3D_DATA_A, _LUT_3D_DATA_B) +#define LUT_3D_DATA_RED_MASK REG_GENMASK(29, 20) +#define LUT_3D_DATA_GREEN_MASK REG_GENMASK(19, 10) +#define LUT_3D_DATA_BLUE_MASK REG_GENMASK(9, 0) +#define LUT_3D_DATA_RED(x) REG_FIELD_PREP(LUT_3D_DATA_RED_MASK, (x)) +#define LUT_3D_DATA_GREEN(x) REG_FIELD_PREP(LUT_3D_DATA_GREEN_MASK, (x)) +#define LUT_3D_DATA_BLUE(x) REG_FIELD_PREP(LUT_3D_DATA_BLUE_MASK, (x)) + #endif /* __INTEL_COLOR_REGS_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_colorop.c b/drivers/gpu/drm/i915/display/intel_colorop.c new file mode 100644 index 000000000000..f2fc0d8780ce --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_colorop.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2025 Intel Corporation + */ +#include "intel_colorop.h" + +struct intel_colorop *to_intel_colorop(struct drm_colorop *colorop) +{ + return container_of(colorop, struct intel_colorop, base); +} + +struct intel_colorop *intel_colorop_alloc(void) +{ + struct intel_colorop *colorop; + + colorop = kzalloc(sizeof(*colorop), GFP_KERNEL); + if (!colorop) + return ERR_PTR(-ENOMEM); + + return colorop; +} + +struct intel_colorop *intel_colorop_create(enum intel_color_block id) +{ + struct intel_colorop *colorop; + + colorop = intel_colorop_alloc(); + + if (IS_ERR(colorop)) + return colorop; + + colorop->id = id; + + return colorop; +} diff --git a/drivers/gpu/drm/i915/display/intel_colorop.h b/drivers/gpu/drm/i915/display/intel_colorop.h new file mode 100644 index 000000000000..21d58eb9f3d0 --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_colorop.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef __INTEL_COLOROP_H__ +#define __INTEL_COLOROP_H__ + +#include "intel_display_types.h" + +struct intel_colorop *to_intel_colorop(struct drm_colorop *colorop); +struct intel_colorop *intel_colorop_alloc(void); +struct intel_colorop *intel_colorop_create(enum intel_color_block id); + +#endif /* __INTEL_COLOROP_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 7b4fd18c60e2..095a319f8bc9 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -7304,6 +7304,7 @@ static void intel_atomic_dsb_finish(struct intel_atomic_state *state, struct intel_display *display = to_intel_display(state); struct intel_crtc_state *new_crtc_state = intel_atomic_get_new_crtc_state(state, crtc); + unsigned int size = new_crtc_state->plane_color_changed ? 8192 : 1024; if (!new_crtc_state->use_flipq && !new_crtc_state->use_dsb && @@ -7314,10 +7315,12 @@ static void intel_atomic_dsb_finish(struct intel_atomic_state *state, * Rough estimate: * ~64 registers per each plane * 8 planes = 512 * Double that for pipe stuff and other overhead. + * ~4913 registers for 3DLUT + * ~200 color registers * 3 HDR planes */ new_crtc_state->dsb_commit = intel_dsb_prepare(state, crtc, INTEL_DSB_0, new_crtc_state->use_dsb || - new_crtc_state->use_flipq ? 1024 : 16); + new_crtc_state->use_flipq ? 
size : 16); if (!new_crtc_state->dsb_commit) { new_crtc_state->use_flipq = false; new_crtc_state->use_dsb = false; diff --git a/drivers/gpu/drm/i915/display/intel_display_limits.h b/drivers/gpu/drm/i915/display/intel_display_limits.h index f0fa27e365ab..cb3c9c665c44 100644 --- a/drivers/gpu/drm/i915/display/intel_display_limits.h +++ b/drivers/gpu/drm/i915/display/intel_display_limits.h @@ -138,4 +138,13 @@ enum hpd_pin { HPD_NUM_PINS }; +enum intel_color_block { + INTEL_PLANE_CB_PRE_CSC_LUT, + INTEL_PLANE_CB_CSC, + INTEL_PLANE_CB_POST_CSC_LUT, + INTEL_PLANE_CB_3DLUT, + + INTEL_CB_MAX +}; + #endif /* __INTEL_DISPLAY_LIMITS_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h index 38702a9e0f50..06bf8f7c0989 100644 --- a/drivers/gpu/drm/i915/display/intel_display_types.h +++ b/drivers/gpu/drm/i915/display/intel_display_types.h @@ -646,6 +646,7 @@ struct intel_plane_state { enum drm_color_encoding color_encoding; enum drm_color_range color_range; enum drm_scaling_filter scaling_filter; + struct drm_property_blob *ctm, *degamma_lut, *gamma_lut, *lut_3d; } hw; struct i915_vma *ggtt_vma; @@ -1391,6 +1392,9 @@ struct intel_crtc_state { u8 silence_period_sym_clocks; u8 lfps_half_cycle_num_of_syms; } alpm_state; + + /* to track changes in plane color blocks */ + bool plane_color_changed; }; enum intel_pipe_crc_source { @@ -1985,6 +1989,11 @@ struct intel_dp_mst_encoder { struct intel_connector *connector; }; +struct intel_colorop { + struct drm_colorop base; + enum intel_color_block id; +}; + static inline struct intel_encoder * intel_attached_encoder(struct intel_connector *connector) { diff --git a/drivers/gpu/drm/i915/display/intel_plane.c b/drivers/gpu/drm/i915/display/intel_plane.c index 5105e3278bc4..ab6a58530b39 100644 --- a/drivers/gpu/drm/i915/display/intel_plane.c +++ b/drivers/gpu/drm/i915/display/intel_plane.c @@ -49,6 +49,7 @@ #include "i9xx_plane_regs.h" #include "intel_cdclk.h" #include "intel_cursor.h" +#include "intel_colorop.h" #include "intel_display_rps.h" #include "intel_display_trace.h" #include "intel_display_types.h" @@ -336,6 +337,58 @@ intel_plane_copy_uapi_plane_damage(struct intel_plane_state *new_plane_state, *damage = drm_plane_state_src(&new_uapi_plane_state->uapi); } +static bool +intel_plane_colorop_replace_blob(struct intel_plane_state *plane_state, + struct intel_colorop *intel_colorop, + struct drm_property_blob *blob) +{ + if (intel_colorop->id == INTEL_PLANE_CB_CSC) + return drm_property_replace_blob(&plane_state->hw.ctm, blob); + else if (intel_colorop->id == INTEL_PLANE_CB_PRE_CSC_LUT) + return drm_property_replace_blob(&plane_state->hw.degamma_lut, blob); + else if (intel_colorop->id == INTEL_PLANE_CB_POST_CSC_LUT) + return drm_property_replace_blob(&plane_state->hw.gamma_lut, blob); + else if (intel_colorop->id == INTEL_PLANE_CB_3DLUT) + return drm_property_replace_blob(&plane_state->hw.lut_3d, blob); + + return false; +} + +static void +intel_plane_color_copy_uapi_to_hw_state(struct intel_plane_state *plane_state, + const struct intel_plane_state *from_plane_state, + struct intel_crtc *crtc) +{ + struct drm_colorop *iter_colorop, *colorop; + struct drm_colorop_state *new_colorop_state; + struct drm_atomic_state *state = plane_state->uapi.state; + struct intel_colorop *intel_colorop; + struct drm_property_blob *blob; + struct intel_atomic_state *intel_atomic_state = to_intel_atomic_state(state); + struct intel_crtc_state *new_crtc_state = intel_atomic_state ? 
+ intel_atomic_get_new_crtc_state(intel_atomic_state, crtc) : NULL; + bool changed = false; + int i = 0; + + iter_colorop = plane_state->uapi.color_pipeline; + + while (iter_colorop) { + for_each_new_colorop_in_state(state, colorop, new_colorop_state, i) { + if (new_colorop_state->colorop == iter_colorop) { + blob = new_colorop_state->bypass ? NULL : new_colorop_state->data; + intel_colorop = to_intel_colorop(colorop); + changed |= intel_plane_colorop_replace_blob(plane_state, + intel_colorop, + blob); + } + } + iter_colorop = iter_colorop->next; + } + + if (new_crtc_state && changed) + new_crtc_state->plane_color_changed = true; +} + void intel_plane_copy_uapi_to_hw_state(struct intel_plane_state *plane_state, const struct intel_plane_state *from_plane_state, struct intel_crtc *crtc) @@ -364,6 +417,8 @@ void intel_plane_copy_uapi_to_hw_state(struct intel_plane_state *plane_state, plane_state->uapi.src = drm_plane_state_src(&from_plane_state->uapi); plane_state->uapi.dst = drm_plane_state_dest(&from_plane_state->uapi); + + intel_plane_color_copy_uapi_to_hw_state(plane_state, from_plane_state, crtc); } void intel_plane_copy_hw_state(struct intel_plane_state *plane_state, diff --git a/drivers/gpu/drm/i915/display/skl_universal_plane.c b/drivers/gpu/drm/i915/display/skl_universal_plane.c index 89c8003ccfe7..ee8e24497d2c 100644 --- a/drivers/gpu/drm/i915/display/skl_universal_plane.c +++ b/drivers/gpu/drm/i915/display/skl_universal_plane.c @@ -11,6 +11,8 @@ #include "pxp/intel_pxp.h" #include "intel_bo.h" +#include "intel_color.h" +#include "intel_color_pipeline.h" #include "intel_de.h" #include "intel_display_irq.h" #include "intel_display_regs.h" @@ -1275,6 +1277,18 @@ static u32 glk_plane_color_ctl(const struct intel_plane_state *plane_state) if (plane_state->force_black) plane_color_ctl |= PLANE_COLOR_PLANE_CSC_ENABLE; + if (plane_state->hw.degamma_lut) + plane_color_ctl |= PLANE_COLOR_PRE_CSC_GAMMA_ENABLE; + + if (plane_state->hw.ctm) + plane_color_ctl |= PLANE_COLOR_PLANE_CSC_ENABLE; + + if (plane_state->hw.gamma_lut) { + plane_color_ctl &= ~PLANE_COLOR_PLANE_GAMMA_DISABLE; + if (drm_color_lut32_size(plane_state->hw.gamma_lut) != 32) + plane_color_ctl |= PLANE_COLOR_POST_CSC_GAMMA_MULTSEG_ENABLE; + } + return plane_color_ctl; } @@ -1556,6 +1570,8 @@ icl_plane_update_noarm(struct intel_dsb *dsb, plane_color_ctl = plane_state->color_ctl | glk_plane_color_ctl_crtc(crtc_state); + intel_color_plane_program_pipeline(dsb, plane_state); + /* The scaler will handle the output position */ if (plane_state->scaler_id >= 0) { crtc_x = 0; @@ -1657,6 +1673,8 @@ icl_plane_update_arm(struct intel_dsb *dsb, icl_plane_update_sel_fetch_arm(dsb, plane, crtc_state, plane_state); + intel_color_plane_commit_arm(dsb, plane_state); + /* * In order to have FBC for fp16 formats pixel normalizer block must be * active. Check if pixel normalizer block need to be enabled for FBC. 
@@ -3001,6 +3019,9 @@ skl_universal_plane_create(struct intel_display *display, DRM_COLOR_YCBCR_BT709, DRM_COLOR_YCBCR_LIMITED_RANGE); + if (DISPLAY_VER(display) >= 12) + intel_color_pipeline_plane_init(&plane->base, pipe); + drm_plane_create_alpha_property(&plane->base); drm_plane_create_blend_mode_property(&plane->base, BIT(DRM_MODE_BLEND_PIXEL_NONE) | diff --git a/drivers/gpu/drm/i915/display/skl_universal_plane_regs.h b/drivers/gpu/drm/i915/display/skl_universal_plane_regs.h index 6f815b231340..6fd4da9f63cf 100644 --- a/drivers/gpu/drm/i915/display/skl_universal_plane_regs.h +++ b/drivers/gpu/drm/i915/display/skl_universal_plane_regs.h @@ -254,6 +254,8 @@ #define PLANE_COLOR_PIPE_CSC_ENABLE REG_BIT(23) /* Pre-ICL */ #define PLANE_COLOR_PLANE_CSC_ENABLE REG_BIT(21) /* ICL+ */ #define PLANE_COLOR_INPUT_CSC_ENABLE REG_BIT(20) /* ICL+ */ +#define PLANE_COLOR_POST_CSC_GAMMA_MULTSEG_ENABLE REG_BIT(15) /* TGL+ */ +#define PLANE_COLOR_PRE_CSC_GAMMA_ENABLE REG_BIT(14) #define PLANE_COLOR_CSC_MODE_MASK REG_GENMASK(19, 17) #define PLANE_COLOR_CSC_MODE_BYPASS REG_FIELD_PREP(PLANE_COLOR_CSC_MODE_MASK, 0) #define PLANE_COLOR_CSC_MODE_YUV601_TO_RGB601 REG_FIELD_PREP(PLANE_COLOR_CSC_MODE_MASK, 1) @@ -290,6 +292,119 @@ _PLANE_INPUT_CSC_POSTOFF_HI_1_A, _PLANE_INPUT_CSC_POSTOFF_HI_1_B, \ _PLANE_INPUT_CSC_POSTOFF_HI_2_A, _PLANE_INPUT_CSC_POSTOFF_HI_2_B) +#define _MMIO_PLANE_GAMC(plane, i, a, b) _MMIO(_PIPE(plane, a, b) + (i) * 4) + +#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1_A 0x70160 +#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1_B 0x71160 +#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2_A 0x70260 +#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2_B 0x71260 +#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1_A, \ + _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1_B) +#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2_A, \ + _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2_B) +#define PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1(pipe), \ + _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2(pipe)) + +#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1_A 0x70164 +#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1_B 0x71164 +#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2_A 0x70264 +#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2_B 0x71264 +#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1_A, \ + _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1_B) +#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2_A, \ + _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2_B) +#define PLANE_POST_CSC_GAMC_SEG0_DATA_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1(pipe), \ + _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2(pipe)) + +#define _PLANE_POST_CSC_GAMC_INDEX_ENH_1_A 0x701d8 +#define _PLANE_POST_CSC_GAMC_INDEX_ENH_1_B 0x711d8 +#define _PLANE_POST_CSC_GAMC_INDEX_ENH_2_A 0x702d8 +#define _PLANE_POST_CSC_GAMC_INDEX_ENH_2_B 0x712d8 +#define _PLANE_POST_CSC_GAMC_INDEX_ENH_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_INDEX_ENH_1_A, \ + _PLANE_POST_CSC_GAMC_INDEX_ENH_1_B) +#define _PLANE_POST_CSC_GAMC_INDEX_ENH_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_INDEX_ENH_2_A, \ + _PLANE_POST_CSC_GAMC_INDEX_ENH_2_B) +#define PLANE_POST_CSC_GAMC_INDEX_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_INDEX_ENH_1(pipe), \ + _PLANE_POST_CSC_GAMC_INDEX_ENH_2(pipe)) + +#define _PLANE_POST_CSC_GAMC_DATA_ENH_1_A 0x701dc 
+#define _PLANE_POST_CSC_GAMC_DATA_ENH_1_B 0x711dc +#define _PLANE_POST_CSC_GAMC_DATA_ENH_2_A 0x702dc +#define _PLANE_POST_CSC_GAMC_DATA_ENH_2_B 0x712dc +#define _PLANE_POST_CSC_GAMC_DATA_ENH_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_DATA_ENH_1_A, \ + _PLANE_POST_CSC_GAMC_DATA_ENH_1_B) +#define _PLANE_POST_CSC_GAMC_DATA_ENH_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_DATA_ENH_2_A, \ + _PLANE_POST_CSC_GAMC_DATA_ENH_2_B) +#define PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_DATA_ENH_1(pipe), \ + _PLANE_POST_CSC_GAMC_DATA_ENH_2(pipe)) + +#define _PLANE_POST_CSC_GAMC_INDEX_1_A 0x704d8 +#define _PLANE_POST_CSC_GAMC_INDEX_1_B 0x714d8 +#define _PLANE_POST_CSC_GAMC_INDEX_2_A 0x705d8 +#define _PLANE_POST_CSC_GAMC_INDEX_2_B 0x715d8 +#define _PLANE_POST_CSC_GAMC_INDEX_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_INDEX_1_A, \ + _PLANE_POST_CSC_GAMC_INDEX_1_B) +#define _PLANE_POST_CSC_GAMC_INDEX_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_INDEX_2_A, \ + _PLANE_POST_CSC_GAMC_INDEX_2_B) +#define PLANE_POST_CSC_GAMC_INDEX(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_INDEX_1(pipe), \ + _PLANE_POST_CSC_GAMC_INDEX_2(pipe)) + +#define _PLANE_POST_CSC_GAMC_DATA_1_A 0x704dc +#define _PLANE_POST_CSC_GAMC_DATA_1_B 0x714dc +#define _PLANE_POST_CSC_GAMC_DATA_2_A 0x705dc +#define _PLANE_POST_CSC_GAMC_DATA_2_B 0x715dc +#define _PLANE_POST_CSC_GAMC_DATA_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_DATA_1_A, \ + _PLANE_POST_CSC_GAMC_DATA_1_B) +#define _PLANE_POST_CSC_GAMC_DATA_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_DATA_2_A, \ + _PLANE_POST_CSC_GAMC_DATA_2_B) +#define PLANE_POST_CSC_GAMC_DATA(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_DATA_1(pipe), \ + _PLANE_POST_CSC_GAMC_DATA_2(pipe)) + +#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_1_A 0x701d0 +#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_1_B 0x711d0 +#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_2_A 0x702d0 +#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_2_B 0x712d0 +#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_1(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_INDEX_ENH_1_A, \ + _PLANE_PRE_CSC_GAMC_INDEX_ENH_1_B) +#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_2(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_INDEX_ENH_2_A, \ + _PLANE_PRE_CSC_GAMC_INDEX_ENH_2_B) +#define PLANE_PRE_CSC_GAMC_INDEX_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_PRE_CSC_GAMC_INDEX_ENH_1(pipe), \ + _PLANE_PRE_CSC_GAMC_INDEX_ENH_2(pipe)) +#define PLANE_PAL_PREC_AUTO_INCREMENT REG_BIT(10) + +#define _PLANE_PRE_CSC_GAMC_DATA_ENH_1_A 0x701d4 +#define _PLANE_PRE_CSC_GAMC_DATA_ENH_1_B 0x711d4 +#define _PLANE_PRE_CSC_GAMC_DATA_ENH_2_A 0x702d4 +#define _PLANE_PRE_CSC_GAMC_DATA_ENH_2_B 0x712d4 +#define _PLANE_PRE_CSC_GAMC_DATA_ENH_1(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_DATA_ENH_1_A, \ + _PLANE_PRE_CSC_GAMC_DATA_ENH_1_B) +#define _PLANE_PRE_CSC_GAMC_DATA_ENH_2(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_DATA_ENH_2_A, \ + _PLANE_PRE_CSC_GAMC_DATA_ENH_2_B) +#define PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_PRE_CSC_GAMC_DATA_ENH_1(pipe), \ + _PLANE_PRE_CSC_GAMC_DATA_ENH_2(pipe)) + +#define _PLANE_PRE_CSC_GAMC_INDEX_1_A 0x704d0 +#define _PLANE_PRE_CSC_GAMC_INDEX_1_B 0x714d0 +#define _PLANE_PRE_CSC_GAMC_INDEX_2_A 0x705d0 +#define _PLANE_PRE_CSC_GAMC_INDEX_2_B 0x715d0 +#define _PLANE_PRE_CSC_GAMC_INDEX_1(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_INDEX_1_A, \ + _PLANE_PRE_CSC_GAMC_INDEX_1_B) +#define _PLANE_PRE_CSC_GAMC_INDEX_2(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_INDEX_2_A, \ + _PLANE_PRE_CSC_GAMC_INDEX_2_B) +#define PLANE_PRE_CSC_GAMC_INDEX(pipe, plane, i) 
_MMIO_PLANE_GAMC(plane, i, _PLANE_PRE_CSC_GAMC_INDEX_1(pipe), \ + _PLANE_PRE_CSC_GAMC_INDEX_2(pipe)) + +#define _PLANE_PRE_CSC_GAMC_DATA_1_A 0x704d4 +#define _PLANE_PRE_CSC_GAMC_DATA_1_B 0x714d4 +#define _PLANE_PRE_CSC_GAMC_DATA_2_A 0x705d4 +#define _PLANE_PRE_CSC_GAMC_DATA_2_B 0x715d4 +#define _PLANE_PRE_CSC_GAMC_DATA_1(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_DATA_1_A, \ + _PLANE_PRE_CSC_GAMC_DATA_1_B) +#define _PLANE_PRE_CSC_GAMC_DATA_2(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_DATA_2_A, \ + _PLANE_PRE_CSC_GAMC_DATA_2_B) +#define PLANE_PRE_CSC_GAMC_DATA(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_PRE_CSC_GAMC_DATA_1(pipe), \ + _PLANE_PRE_CSC_GAMC_DATA_2(pipe)) + #define _PLANE_CSC_RY_GY_1_A 0x70210 #define _PLANE_CSC_RY_GY_2_A 0x70310 #define _PLANE_CSC_RY_GY_1_B 0x71210 diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c index 890183de2277..a30060fd4429 100644 --- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c +++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c @@ -20,16 +20,6 @@ #include "gt/intel_gt_regs.h" #ifdef CONFIG_64BIT -static void _release_bars(struct pci_dev *pdev) -{ - int resno; - - for (resno = PCI_STD_RESOURCES; resno < PCI_STD_RESOURCE_END; resno++) { - if (pci_resource_len(pdev, resno)) - pci_release_resource(pdev, resno); - } -} - static void _resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size) { @@ -37,9 +27,7 @@ _resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size) int bar_size = pci_rebar_bytes_to_size(size); int ret; - _release_bars(pdev); - - ret = pci_resize_resource(pdev, resno, bar_size); + ret = pci_resize_resource(pdev, resno, bar_size, 0); if (ret) { drm_info(&i915->drm, "Failed to resize BAR%d to %dM (%pe)\n", resno, 1 << bar_size, ERR_PTR(ret)); @@ -63,16 +51,12 @@ static void i915_resize_lmem_bar(struct drm_i915_private *i915, resource_size_t current_size = roundup_pow_of_two(pci_resource_len(pdev, GEN12_LMEM_BAR)); if (i915->params.lmem_bar_size) { - u32 bar_sizes; - - rebar_size = i915->params.lmem_bar_size * - (resource_size_t)SZ_1M; - bar_sizes = pci_rebar_get_possible_sizes(pdev, GEN12_LMEM_BAR); - + rebar_size = i915->params.lmem_bar_size * (resource_size_t)SZ_1M; if (rebar_size == current_size) return; - if (!(bar_sizes & BIT(pci_rebar_bytes_to_size(rebar_size))) || + if (!pci_rebar_size_supported(pdev, GEN12_LMEM_BAR, + pci_rebar_bytes_to_size(rebar_size)) || rebar_size >= roundup_pow_of_two(lmem_size)) { rebar_size = lmem_size; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index bbeba0d3fca8..3abc9206f1a8 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1141,6 +1141,122 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags, return func(vgpu, index, start, count, flags, data); } +static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + int nr_areas = 1; + int cap_type_id; + unsigned int i; + int ret; + + switch (info->index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->gvt->device_info.cfg_space_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 
vgpu->cfg_space.bar[info->index].size; + if (!info->size) { + info->flags = 0; + break; + } + + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + break; + case VFIO_PCI_BAR2_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = VFIO_REGION_INFO_FLAG_CAPS | + VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + info->size = gvt_aperture_sz(vgpu->gvt); + + sparse = kzalloc(struct_size(sparse, areas, nr_areas), + GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->areas[0].offset = + PAGE_ALIGN(vgpu_aperture_offset(vgpu)); + sparse->areas[0].size = vgpu_aperture_sz(vgpu); + break; + + case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + + gvt_dbg_core("get region info bar:%d\n", info->index); + break; + + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + + gvt_dbg_core("get region info index:%d\n", info->index); + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 + }; + + if (info->index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) + return -EINVAL; + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); + + i = info->index - VFIO_PCI_NUM_REGIONS; + + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->region[i].size; + info->flags = vgpu->region[i].flags; + + cap_type.type = vgpu->region[i].type; + cap_type.subtype = vgpu->region[i].subtype; + + ret = vfio_info_add_capability(caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + } + } + + if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + ret = -EINVAL; + if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) { + ret = vfio_info_add_capability( + caps, &sparse->header, + struct_size(sparse, areas, sparse->nr_areas)); + } + if (ret) { + kfree(sparse); + return ret; + } + } + + kfree(sparse); + return 0; +} + static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, unsigned long arg) { @@ -1169,152 +1285,6 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned int i; - int ret; - struct vfio_region_info_cap_sparse_mmap *sparse = NULL; - int nr_areas = 1; - int cap_type_id; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->gvt->device_info.cfg_space_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->cfg_space.bar[info.index].size; - if (!info.size) { - info.flags = 0; - break; - } - - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR1_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - break; - case VFIO_PCI_BAR2_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_CAPS | - VFIO_REGION_INFO_FLAG_MMAP | - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - info.size = gvt_aperture_sz(vgpu->gvt); - - sparse = kzalloc(struct_size(sparse, areas, nr_areas), - GFP_KERNEL); - if (!sparse) - return -ENOMEM; - - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->nr_areas = nr_areas; - cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->areas[0].offset = - PAGE_ALIGN(vgpu_aperture_offset(vgpu)); - sparse->areas[0].size = vgpu_aperture_sz(vgpu); - break; - - case VFIO_PCI_BAR3_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - - gvt_dbg_core("get region info bar:%d\n", info.index); - break; - - case VFIO_PCI_ROM_REGION_INDEX: - case VFIO_PCI_VGA_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - - gvt_dbg_core("get region info index:%d\n", info.index); - break; - default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; - - if (info.index >= VFIO_PCI_NUM_REGIONS + - vgpu->num_regions) - return -EINVAL; - info.index = - array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vgpu->num_regions); - - i = info.index - VFIO_PCI_NUM_REGIONS; - - info.offset = - VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->region[i].size; - info.flags = vgpu->region[i].flags; - - cap_type.type = vgpu->region[i].type; - cap_type.subtype = vgpu->region[i].subtype; - - ret = vfio_info_add_capability(&caps, - &cap_type.header, - sizeof(cap_type)); - if (ret) - return ret; - } - } - - if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { - ret = -EINVAL; - if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) - ret = vfio_info_add_capability(&caps, - &sparse->header, - struct_size(sparse, areas, - sparse->nr_areas)); - if (ret) { - kfree(sparse); - return ret; - } - } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - kfree(sparse); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - - kfree(caps.buf); - } - - kfree(sparse); - return copy_to_user((void __user *)arg, &info, minsz) ? 
- -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -1477,6 +1447,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .write = intel_vgpu_write, .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, + .get_region_info_caps = intel_vgpu_ioctl_get_region_info, .dma_unmap = intel_vgpu_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index e4b273b025d2..62be4a5227e4 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -184,6 +184,10 @@ xe-$(CONFIG_PCI_IOV) += \ xe_sriov_pf_sysfs.o \ xe_tile_sriov_pf_debugfs.o +ifdef CONFIG_XE_VFIO_PCI + xe-$(CONFIG_PCI_IOV) += xe_sriov_vfio.o +endif + # include helpers for tests even when XE is built-in ifdef CONFIG_DRM_XE_KUNIT_TEST xe-y += tests/xe_kunit_helpers.o @@ -242,6 +246,8 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \ i915-display/intel_cdclk.o \ i915-display/intel_cmtg.o \ i915-display/intel_color.o \ + i915-display/intel_colorop.o \ + i915-display/intel_color_pipeline.o \ i915-display/intel_combo_phy.o \ i915-display/intel_connector.o \ i915-display/intel_crtc.o \ diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h index 9955397aaaa9..c7a77a3a9681 100644 --- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h @@ -54,13 +54,14 @@ static inline void xe_sched_tdr_queue_imm(struct xe_gpu_scheduler *sched) static inline void xe_sched_resubmit_jobs(struct xe_gpu_scheduler *sched) { struct drm_sched_job *s_job; + bool restore_replay = false; list_for_each_entry(s_job, &sched->base.pending_list, list) { struct drm_sched_fence *s_fence = s_job->s_fence; struct dma_fence *hw_fence = s_fence->parent; - if (to_xe_sched_job(s_job)->skip_emit || - (hw_fence && !dma_fence_is_signaled(hw_fence))) + restore_replay |= to_xe_sched_job(s_job)->restore_replay; + if (restore_replay || (hw_fence && !dma_fence_is_signaled(hw_fence))) sched->base.ops->run_job(s_job); } } diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 62f6cc45a764..59c5c6b4d994 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -711,7 +711,7 @@ static u64 pf_profile_fair_ggtt(struct xe_gt *gt, unsigned int num_vfs) if (num_vfs > 56) return SZ_64M - SZ_8M; - return rounddown_pow_of_two(shareable / num_vfs); + return rounddown_pow_of_two(div_u64(shareable, num_vfs)); } /** diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c index d5d918ddce4f..3174a8dee779 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c @@ -17,6 +17,7 @@ #include "xe_gt_sriov_pf_helpers.h" #include "xe_gt_sriov_pf_migration.h" #include "xe_gt_sriov_printk.h" +#include "xe_guc.h" #include "xe_guc_buf.h" #include "xe_guc_ct.h" #include "xe_migrate.h" @@ -1023,6 +1024,12 @@ static void action_ring_cleanup(void *arg) ptr_ring_cleanup(r, destroy_pf_packet); } +static void pf_gt_migration_check_support(struct xe_gt *gt) +{ + if (GUC_FIRMWARE_VER(&gt->uc.guc) < MAKE_GUC_VER(70, 54, 0)) + xe_sriov_pf_migration_disable(gt_to_xe(gt), "requires GuC version >= 70.54.0"); +} + /** * xe_gt_sriov_pf_migration_init() - Initialize support for VF migration.
* @gt: the &xe_gt @@ -1039,6 +1046,8 @@ int xe_gt_sriov_pf_migration_init(struct xe_gt *gt) xe_gt_assert(gt, IS_SRIOV_PF(xe)); + pf_gt_migration_check_support(gt); + if (!pf_migration_supported(gt)) return 0; diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index d4ffdb71ef3d..ed7be50b2f72 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -822,7 +822,7 @@ static void submit_exec_queue(struct xe_exec_queue *q, struct xe_sched_job *job) xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q)); - if (!job->skip_emit || job->last_replay) { + if (!job->restore_replay || job->last_replay) { if (xe_exec_queue_is_parallel(q)) wq_item_append(q); else @@ -881,10 +881,10 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job) if (!killed_or_banned_or_wedged && !xe_sched_job_is_error(job)) { if (!exec_queue_registered(q)) register_exec_queue(q, GUC_CONTEXT_NORMAL); - if (!job->skip_emit) + if (!job->restore_replay) q->ring_ops->emit_job(job); submit_exec_queue(q, job); - job->skip_emit = false; + job->restore_replay = false; } /* @@ -2112,6 +2112,18 @@ static void guc_exec_queue_revert_pending_state_change(struct xe_guc *guc, q->guc->resume_time = 0; } +static void lrc_parallel_clear(struct xe_lrc *lrc) +{ + struct xe_device *xe = gt_to_xe(lrc->gt); + struct iosys_map map = xe_lrc_parallel_map(lrc); + int i; + + for (i = 0; i < WQ_SIZE / sizeof(u32); ++i) + parallel_write(xe, map, wq[i], + FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) | + FIELD_PREP(WQ_LEN_MASK, 0)); +} + /* * This function is quite complex but only real way to ensure no state is lost * during VF resume flows. The function scans the queue state, make adjustments @@ -2135,8 +2147,8 @@ static void guc_exec_queue_pause(struct xe_guc *guc, struct xe_exec_queue *q) guc_exec_queue_revert_pending_state_change(guc, q); if (xe_exec_queue_is_parallel(q)) { - struct xe_device *xe = guc_to_xe(guc); - struct iosys_map map = xe_lrc_parallel_map(q->lrc[0]); + /* Pairs with WRITE_ONCE in __xe_exec_queue_init */ + struct xe_lrc *lrc = READ_ONCE(q->lrc[0]); /* * NOP existing WQ commands that may contain stale GGTT @@ -2144,14 +2156,14 @@ static void guc_exec_queue_pause(struct xe_guc *guc, struct xe_exec_queue *q) * seems to get confused if the WQ head/tail pointers are * adjusted. */ - for (i = 0; i < WQ_SIZE / sizeof(u32); ++i) - parallel_write(xe, map, wq[i], - FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) | - FIELD_PREP(WQ_LEN_MASK, 0)); + if (lrc) + lrc_parallel_clear(lrc); } job = xe_sched_first_pending_job(sched); if (job) { + job->restore_replay = true; + /* * Adjust software tail so jobs submitted overwrite previous * position in ring buffer with new GGTT addresses. 
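/*
 * Illustrative sketch (not part of the patch above): the xe_gpu_scheduler.h
 * and xe_guc_submit.c hunks replace the per-job skip_emit flag with a
 * "sticky" restore_replay flag - once the first pending job is marked for
 * replay, it and every later job in the pending list get re-emitted on
 * resume. A minimal stand-alone model of that selection pattern, using
 * hypothetical demo_* names instead of the driver's real types, could look
 * like this:
 */
#include <linux/list.h>
#include <linux/types.h>

struct demo_job {
	struct list_head link;	/* entry in the scheduler's pending list */
	bool restore_replay;	/* set on the first job that must be replayed */
};

static void demo_mark_replay(struct list_head *pending)
{
	struct demo_job *job;
	bool restore_replay = false;

	list_for_each_entry(job, pending, link) {
		/* once any job is flagged, the flag stays set for the rest */
		restore_replay |= job->restore_replay;
		if (restore_replay)
			job->restore_replay = true;	/* this job gets re-emitted */
	}
}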
@@ -2241,17 +2253,18 @@ static void guc_exec_queue_unpause_prepare(struct xe_guc *guc, struct xe_exec_queue *q) { struct xe_gpu_scheduler *sched = &q->guc->sched; - struct drm_sched_job *s_job; struct xe_sched_job *job = NULL; + bool restore_replay = false; - list_for_each_entry(s_job, &sched->base.pending_list, list) { - job = to_xe_sched_job(s_job); - - xe_gt_dbg(guc_to_gt(guc), "Replay JOB - guc_id=%d, seqno=%d", - q->guc->id, xe_sched_job_seqno(job)); + list_for_each_entry(job, &sched->base.pending_list, drm.list) { + restore_replay |= job->restore_replay; + if (restore_replay) { + xe_gt_dbg(guc_to_gt(guc), "Replay JOB - guc_id=%d, seqno=%d", + q->guc->id, xe_sched_job_seqno(job)); - q->ring_ops->emit_job(job); - job->skip_emit = true; + q->ring_ops->emit_job(job); + job->restore_replay = true; + } } if (job) diff --git a/drivers/gpu/drm/xe/xe_pagefault.c b/drivers/gpu/drm/xe/xe_pagefault.c index fe3e40145012..afb06598b6e1 100644 --- a/drivers/gpu/drm/xe/xe_pagefault.c +++ b/drivers/gpu/drm/xe/xe_pagefault.c @@ -102,7 +102,6 @@ retry_userptr: /* Lock VM and BOs dma-resv */ xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {}); - drm_exec_init(&exec, 0, 0); drm_exec_until_all_locked(&exec) { err = xe_pagefault_begin(&exec, vma, tile->mem.vram, needs_vram == 1); diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 4636e4ef9baa..9c9ea10d994c 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -1223,6 +1223,23 @@ static struct pci_driver xe_pci_driver = { #endif }; +/** + * xe_pci_to_pf_device() - Get PF &xe_device. + * @pdev: the VF &pci_dev device + * + * Return: pointer to PF &xe_device, NULL otherwise. + */ +struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev) +{ + struct drm_device *drm; + + drm = pci_iov_get_pf_drvdata(pdev, &xe_pci_driver); + if (IS_ERR(drm)) + return NULL; + + return to_xe_device(drm); +} + int xe_register_pci_driver(void) { return pci_register_driver(&xe_pci_driver); diff --git a/drivers/gpu/drm/xe/xe_pci.h b/drivers/gpu/drm/xe/xe_pci.h index 611c1209b14c..11bcc5fe2c5b 100644 --- a/drivers/gpu/drm/xe/xe_pci.h +++ b/drivers/gpu/drm/xe/xe_pci.h @@ -6,7 +6,10 @@ #ifndef _XE_PCI_H_ #define _XE_PCI_H_ +struct pci_dev; + int xe_register_pci_driver(void); void xe_unregister_pci_driver(void); +struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev); #endif diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index 44924512830f..766922530265 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -726,6 +726,13 @@ static void xe_pm_runtime_lockdep_prime(void) /** * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously * @xe: xe device instance + * + * When possible, scope-based runtime PM (through guard(xe_pm_runtime)) is + * preferred over direct usage of this function. Manual get/put handling + * should only be used when the function contains goto-based logic which + * can break scope-based handling, or when the lifetime of the runtime PM + * reference does not match a specific scope (e.g., runtime PM obtained in one + * function and released in a different one). */ void xe_pm_runtime_get(struct xe_device *xe) { @@ -758,6 +765,13 @@ void xe_pm_runtime_put(struct xe_device *xe) * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl * @xe: xe device instance * + * When possible, scope-based runtime PM (through + * ACQUIRE(xe_pm_runtime_ioctl, ...)) is preferred over direct usage of this + * function. 
Manual get/put handling should only be used when the function + * contains goto-based logic which can break scope-based handling, or when the + * lifetime of the runtime PM reference does not match a specific scope (e.g., + * runtime PM obtained in one function and released in a different one). + * * Returns: Any number greater than or equal to 0 for success, negative error * code otherwise. */ @@ -827,6 +841,13 @@ static bool xe_pm_suspending_or_resuming(struct xe_device *xe) * It will warn if not protected. * The reference should be put back after this function regardless, since it * will always bump the usage counter, regardless. + * + * When possible, scope-based runtime PM (through guard(xe_pm_runtime_noresume)) + * is preferred over direct usage of this function. Manual get/put handling + * should only be used when the function contains goto-based logic which can + * break scope-based handling, or when the lifetime of the runtime PM reference + * does not match a specific scope (e.g., runtime PM obtained in one function + * and released in a different one). */ void xe_pm_runtime_get_noresume(struct xe_device *xe) { diff --git a/drivers/gpu/drm/xe/xe_pm.h b/drivers/gpu/drm/xe/xe_pm.h index f7f89a18b6fc..6b27039e7b2d 100644 --- a/drivers/gpu/drm/xe/xe_pm.h +++ b/drivers/gpu/drm/xe/xe_pm.h @@ -6,6 +6,7 @@ #ifndef _XE_PM_H_ #define _XE_PM_H_ +#include <linux/cleanup.h> #include <linux/pm_runtime.h> #define DEFAULT_VRAM_THRESHOLD 300 /* in MB */ @@ -37,4 +38,20 @@ int xe_pm_block_on_suspend(struct xe_device *xe); void xe_pm_might_block_on_suspend(void); int xe_pm_module_init(void); +static inline void __xe_pm_runtime_noop(struct xe_device *xe) {} + +DEFINE_GUARD(xe_pm_runtime, struct xe_device *, + xe_pm_runtime_get(_T), xe_pm_runtime_put(_T)) +DEFINE_GUARD(xe_pm_runtime_noresume, struct xe_device *, + xe_pm_runtime_get_noresume(_T), xe_pm_runtime_put(_T)) +DEFINE_GUARD_COND(xe_pm_runtime, _ioctl, xe_pm_runtime_get_ioctl(_T), _RET >= 0) + +/* + * Used when a function needs to release runtime PM in all possible cases + * and error paths, but the wakeref was already acquired by a different + * function (i.e., get() has already happened so only a put() is needed). + */ +DEFINE_GUARD(xe_pm_runtime_release_only, struct xe_device *, + __xe_pm_runtime_noop(_T), xe_pm_runtime_put(_T)); + #endif diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index d26612abb4ca..7c4c54fe920a 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -63,8 +63,8 @@ struct xe_sched_job { bool ring_ops_flush_tlb; /** @ggtt: mapped in ggtt. */ bool ggtt; - /** @skip_emit: skip emitting the job */ - bool skip_emit; + /** @restore_replay: job being replayed for restore */ + bool restore_replay; /** @last_replay: last job being replayed */ bool last_replay; /** @ptrs: per instance pointers. */ diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c index de06cc690fc8..6c4b16409cc9 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c @@ -46,13 +46,37 @@ bool xe_sriov_pf_migration_supported(struct xe_device *xe) { xe_assert(xe, IS_SRIOV_PF(xe)); - return xe->sriov.pf.migration.supported; + return IS_ENABLED(CONFIG_DRM_XE_DEBUG) || !xe->sriov.pf.migration.disabled; } -static bool pf_check_migration_support(struct xe_device *xe) +/** + * xe_sriov_pf_migration_disable() - Turn off SR-IOV VF migration support on PF.
+ * @xe: the &xe_device instance. + * @fmt: format string for the log message, to be combined with following VAs. + */ +void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...) +{ + struct va_format vaf; + va_list va_args; + + xe_assert(xe, IS_SRIOV_PF(xe)); + + va_start(va_args, fmt); + vaf.fmt = fmt; + vaf.va = &va_args; + xe_sriov_notice(xe, "migration %s: %pV\n", + IS_ENABLED(CONFIG_DRM_XE_DEBUG) ? + "missing prerequisite" : "disabled", + &vaf); + va_end(va_args); + + xe->sriov.pf.migration.disabled = true; +} + +static void pf_migration_check_support(struct xe_device *xe) { - /* XXX: for now this is for feature enabling only */ - return IS_ENABLED(CONFIG_DRM_XE_DEBUG); + if (!xe_device_has_memirq(xe)) + xe_sriov_pf_migration_disable(xe, "requires memory-based IRQ support"); } static void pf_migration_cleanup(void *arg) @@ -77,7 +101,8 @@ int xe_sriov_pf_migration_init(struct xe_device *xe) xe_assert(xe, IS_SRIOV_PF(xe)); - xe->sriov.pf.migration.supported = pf_check_migration_support(xe); + pf_migration_check_support(xe); + if (!xe_sriov_pf_migration_supported(xe)) return 0; diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.h b/drivers/gpu/drm/xe/xe_sriov_pf_migration.h index b806298a0bb6..f8f408df8481 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.h +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.h @@ -14,6 +14,7 @@ struct xe_sriov_packet; int xe_sriov_pf_migration_init(struct xe_device *xe); bool xe_sriov_pf_migration_supported(struct xe_device *xe); +void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...); int xe_sriov_pf_migration_restore_produce(struct xe_device *xe, unsigned int vfid, struct xe_sriov_packet *data); struct xe_sriov_packet * diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h b/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h index 363d673ee1dd..7d9a8a278d91 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h @@ -14,8 +14,8 @@ * struct xe_sriov_pf_migration - Xe device level VF migration data */ struct xe_sriov_pf_migration { - /** @supported: indicates whether VF migration feature is supported */ - bool supported; + /** @disabled: indicates whether VF migration feature is disabled */ + bool disabled; }; /** diff --git a/drivers/gpu/drm/xe/xe_sriov_vfio.c b/drivers/gpu/drm/xe/xe_sriov_vfio.c new file mode 100644 index 000000000000..e9a7615bb5c5 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sriov_vfio.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2025 Intel Corporation + */ + +#include <drm/intel/xe_sriov_vfio.h> +#include <linux/cleanup.h> + +#include "xe_pci.h" +#include "xe_pm.h" +#include "xe_sriov_pf_control.h" +#include "xe_sriov_pf_helpers.h" +#include "xe_sriov_pf_migration.h" + +struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev) +{ + return xe_pci_to_pf_device(pdev); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_get_pf, "xe-vfio-pci"); + +bool xe_sriov_vfio_migration_supported(struct xe_device *xe) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + + return xe_sriov_pf_migration_supported(xe); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_migration_supported, "xe-vfio-pci"); + +#define DEFINE_XE_SRIOV_VFIO_FUNCTION(_type, _func, _impl) \ +_type xe_sriov_vfio_##_func(struct xe_device *xe, unsigned int vfid) \ +{ \ + if (!IS_SRIOV_PF(xe)) \ + return -EPERM; \ + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) \ + return -EINVAL; \ + \ + guard(xe_pm_runtime_noresume)(xe); \ + \ + return 
xe_sriov_pf_##_impl(xe, vfid); \ +} \ +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_##_func, "xe-vfio-pci") + +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, wait_flr_done, control_wait_flr); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, suspend_device, control_pause_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_device, control_resume_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_enter, control_trigger_save_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_exit, control_finish_save_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_enter, control_trigger_restore_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_exit, control_finish_restore_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, error, control_stop_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(ssize_t, stop_copy_size, migration_size); + +ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid, + char __user *buf, size_t len) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) + return -EINVAL; + + guard(xe_pm_runtime_noresume)(xe); + + return xe_sriov_pf_migration_read(xe, vfid, buf, len); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_read, "xe-vfio-pci"); + +ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid, + const char __user *buf, size_t len) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) + return -EINVAL; + + guard(xe_pm_runtime_noresume)(xe); + + return xe_sriov_pf_migration_write(xe, vfid, buf, len); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_write, "xe-vfio-pci"); diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c index 0e10da790cc5..d50baefcd124 100644 --- a/drivers/gpu/drm/xe/xe_vram.c +++ b/drivers/gpu/drm/xe/xe_vram.c @@ -25,39 +25,13 @@ #include "xe_vram.h" #include "xe_vram_types.h" -#define BAR_SIZE_SHIFT 20 - -/* - * Release all the BARs that could influence/block LMEMBAR resizing, i.e. - * assigned IORESOURCE_MEM_64 BARs - */ -static void release_bars(struct pci_dev *pdev) -{ - struct resource *res; - int i; - - pci_dev_for_each_resource(pdev, res, i) { - /* Resource already un-assigned, do not reset it */ - if (!res->parent) - continue; - - /* No need to release unrelated BARs */ - if (!(res->flags & IORESOURCE_MEM_64)) - continue; - - pci_release_resource(pdev, i); - } -} - static void resize_bar(struct xe_device *xe, int resno, resource_size_t size) { struct pci_dev *pdev = to_pci_dev(xe->drm.dev); int bar_size = pci_rebar_bytes_to_size(size); int ret; - release_bars(pdev); - - ret = pci_resize_resource(pdev, resno, bar_size); + ret = pci_resize_resource(pdev, resno, bar_size, 0); if (ret) { drm_info(&xe->drm, "Failed to resize BAR%d to %dM (%pe). Consider enabling 'Resizable BAR' support in your BIOS\n", resno, 1 << bar_size, ERR_PTR(ret)); @@ -79,41 +53,37 @@ void xe_vram_resize_bar(struct xe_device *xe) resource_size_t current_size; resource_size_t rebar_size; struct resource *root_res; - u32 bar_size_mask; + int max_size, i; u32 pci_cmd; - int i; /* gather some relevant info */ current_size = pci_resource_len(pdev, LMEM_BAR); - bar_size_mask = pci_rebar_get_possible_sizes(pdev, LMEM_BAR); - - if (!bar_size_mask) - return; if (force_vram_bar_size < 0) return; /* set to a specific size? 
*/ if (force_vram_bar_size) { - u32 bar_size_bit; + rebar_size = pci_rebar_bytes_to_size(force_vram_bar_size * + (resource_size_t)SZ_1M); - rebar_size = force_vram_bar_size * (resource_size_t)SZ_1M; - - bar_size_bit = bar_size_mask & BIT(pci_rebar_bytes_to_size(rebar_size)); - - if (!bar_size_bit) { + if (!pci_rebar_size_supported(pdev, LMEM_BAR, rebar_size)) { drm_info(&xe->drm, - "Requested size: %lluMiB is not supported by rebar sizes: 0x%x. Leaving default: %lluMiB\n", - (u64)rebar_size >> 20, bar_size_mask, (u64)current_size >> 20); + "Requested size: %lluMiB is not supported by rebar sizes: 0x%llx. Leaving default: %lluMiB\n", + (u64)pci_rebar_size_to_bytes(rebar_size) >> 20, + pci_rebar_get_possible_sizes(pdev, LMEM_BAR), + (u64)current_size >> 20); return; } - rebar_size = 1ULL << (__fls(bar_size_bit) + BAR_SIZE_SHIFT); - + rebar_size = pci_rebar_size_to_bytes(rebar_size); if (rebar_size == current_size) return; } else { - rebar_size = 1ULL << (__fls(bar_size_mask) + BAR_SIZE_SHIFT); + max_size = pci_rebar_get_max_size(pdev, LMEM_BAR); + if (max_size < 0) + return; + rebar_size = pci_rebar_size_to_bytes(max_size); /* only resize if larger than current */ if (rebar_size <= current_size) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index f0323f1d6f01..794b9778816b 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -80,6 +80,7 @@ config INFINIBAND_VIRT_DMA if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS if !UML source "drivers/infiniband/hw/bnxt_re/Kconfig" +source "drivers/infiniband/hw/bng_re/Kconfig" source "drivers/infiniband/hw/cxgb4/Kconfig" source "drivers/infiniband/hw/efa/Kconfig" source "drivers/infiniband/hw/erdma/Kconfig" diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 01bede8ba105..024df6ee239d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -34,7 +34,6 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); -#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */ #define CM_DIRECT_RETRY_CTX ((void *) 1UL) #define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */ @@ -1057,6 +1056,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) { struct cm_id_private *cm_id_priv; enum ib_cm_state old_state; + unsigned long timeout; struct cm_work *work; int ret; @@ -1167,10 +1167,9 @@ retest: xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id)); cm_deref_id(cm_id_priv); + timeout = msecs_to_jiffies((cm_id_priv->max_cm_retries * cm_id_priv->timeout_ms * 5) / 4); do { - ret = wait_for_completion_timeout(&cm_id_priv->comp, - msecs_to_jiffies( - CM_DESTROY_ID_WAIT_TIMEOUT)); + ret = wait_for_completion_timeout(&cm_id_priv->comp, timeout); if (!ret) /* timeout happened */ cm_destroy_id_wait_timeout(cm_id, old_state); } while (!ret); @@ -4518,7 +4517,7 @@ static int __init ib_cm_init(void) get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); INIT_LIST_HEAD(&cm.timewait_list); - cm.wq = alloc_workqueue("ib_cm", 0, 1); + cm.wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1); if (!cm.wq) { ret = -ENOMEM; goto error2; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 5b2d3ae3f9fc..95e89f5c147c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4475,6 +4475,8 @@ int rdma_connect_locked(struct rdma_cm_id *id, container_of(id, struct rdma_id_private, id); int ret; + lockdep_assert_held(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, 
RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index b4f3c835844a..13e8a1714bbd 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -3021,7 +3021,7 @@ static int __init ib_core_init(void) { int ret = -ENOMEM; - ib_wq = alloc_workqueue("infiniband", 0, 0); + ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0); if (!ib_wq) return -ENOMEM; @@ -3031,7 +3031,7 @@ static int __init ib_core_init(void) goto err; ib_comp_wq = alloc_workqueue("ib-comp-wq", - WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0); if (!ib_comp_wq) goto err_unbound; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index a7de6f403fca..b097cfcade1c 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -175,7 +175,7 @@ void rdma_restrack_new(struct rdma_restrack_entry *res, EXPORT_SYMBOL(rdma_restrack_new); /** - * rdma_restrack_add() - add object to the reource tracking database + * rdma_restrack_add() - add object to the resource tracking database * @res: resource entry */ void rdma_restrack_add(struct rdma_restrack_entry *res) @@ -277,7 +277,7 @@ int rdma_restrack_put(struct rdma_restrack_entry *res) EXPORT_SYMBOL(rdma_restrack_put); /** - * rdma_restrack_del() - delete object from the reource tracking database + * rdma_restrack_del() - delete object from the resource tracking database * @res: resource entry */ void rdma_restrack_del(struct rdma_restrack_entry *res) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index f86ece701db6..ec3be65a2b88 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -366,7 +366,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { xa_lock(&ctx_table); if (xa_load(&ctx_table, ctx->id) == ctx) - queue_work(system_unbound_wq, &ctx->close_work); + queue_work(system_dfl_wq, &ctx->close_work); xa_unlock(&ctx_table); } return 0; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c5b686394760..8137031c2a65 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -45,6 +45,8 @@ #include "uverbs.h" +#define RESCHED_LOOP_CNT_THRESHOLD 0x1000 + static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { bool make_dirty = umem->writable && dirty; @@ -55,10 +57,14 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt, DMA_BIDIRECTIONAL, 0); - for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) + for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) { unpin_user_page_range_dirty_lock(sg_page(sg), DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); + if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD)) + cond_resched(); + } + sg_free_append_table(&umem->sgt_append); } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 3a5f81402d2f..11b1a194de44 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -148,6 +148,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) case IB_RATE_400_GBPS: return 160; case IB_RATE_600_GBPS: return 240; case IB_RATE_800_GBPS: return 320; + case IB_RATE_1600_GBPS: return 640; default: return -1; } } @@ -178,6 +179,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) case 160: 
return IB_RATE_400_GBPS; case 240: return IB_RATE_600_GBPS; case 320: return IB_RATE_800_GBPS; + case 640: return IB_RATE_1600_GBPS; default: return IB_RATE_PORT_CURRENT; } } @@ -208,6 +210,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) case IB_RATE_400_GBPS: return 425000; case IB_RATE_600_GBPS: return 637500; case IB_RATE_800_GBPS: return 850000; + case IB_RATE_1600_GBPS: return 1700000; default: return -1; } } diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index b706dc0d0263..c42b22ac3303 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -13,5 +13,6 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ +obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re/ obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ obj-$(CONFIG_INFINIBAND_IONIC) += ionic/ diff --git a/drivers/infiniband/hw/bng_re/Kconfig b/drivers/infiniband/hw/bng_re/Kconfig new file mode 100644 index 000000000000..85845f72c64d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_BNG_RE + tristate "Broadcom Next generation RoCE HCA support" + depends on 64BIT + depends on INET && DCB && BNGE + help + This driver supports Broadcom Next generation + 50/100/200/400/800 gigabit RoCE HCAs. The module + will be called bng_re. To compile this driver + as a module, choose M here. diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile new file mode 100644 index 000000000000..c6aaaf853c77 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge -I $(srctree)/drivers/infiniband/hw/bnxt_re + +obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o + +bng_re-y := bng_dev.o bng_fw.o \ + bng_res.o bng_sp.o \ + bng_debugfs.o diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.c b/drivers/infiniband/hw/bng_re/bng_debugfs.c new file mode 100644 index 000000000000..9ec5a8785250 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_debugfs.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include <linux/debugfs.h> +#include <linux/pci.h> + +#include <rdma/ib_verbs.h> + +#include "bng_res.h" +#include "bng_fw.h" +#include "bnge.h" +#include "bnge_auxr.h" +#include "bng_re.h" +#include "bng_debugfs.h" + +static struct dentry *bng_re_debugfs_root; + +void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev) +{ + struct pci_dev *pdev = rdev->aux_dev->pdev; + + rdev->dbg_root = + debugfs_create_dir(dev_name(&pdev->dev), bng_re_debugfs_root); +} + +void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->dbg_root); + rdev->dbg_root = NULL; +} + +void bng_re_register_debugfs(void) +{ + bng_re_debugfs_root = debugfs_create_dir("bng_re", NULL); +} + +void bng_re_unregister_debugfs(void) +{ + debugfs_remove(bng_re_debugfs_root); +} diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.h b/drivers/infiniband/hw/bng_re/bng_debugfs.h new file mode 100644 index 000000000000..baef71df4242 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_debugfs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. 
+ +#ifndef __BNG_RE_DEBUGFS__ +#define __BNG_RE_DEBUGFS__ + +void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev); +void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev); + +void bng_re_register_debugfs(void); +void bng_re_unregister_debugfs(void); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c new file mode 100644 index 000000000000..d8f8d7f7075f --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -0,0 +1,534 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. + +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/auxiliary_bus.h> + +#include <rdma/ib_verbs.h> + +#include "bng_res.h" +#include "bng_sp.h" +#include "bng_fw.h" +#include "bnge.h" +#include "bnge_auxr.h" +#include "bng_re.h" +#include "bnge_hwrm.h" +#include "bng_debugfs.h" + +MODULE_AUTHOR("Siva Reddy Kallam <siva.kallam@broadcom.com>"); +MODULE_DESCRIPTION(BNG_RE_DESC); +MODULE_LICENSE("Dual BSD/GPL"); + +static struct bng_re_dev *bng_re_dev_add(struct auxiliary_device *adev, + struct bnge_auxr_dev *aux_dev) +{ + struct bng_re_dev *rdev; + + /* Allocate bng_re_dev instance */ + rdev = ib_alloc_device(bng_re_dev, ibdev); + if (!rdev) { + pr_err("%s: bng_re_dev allocation failure!", KBUILD_MODNAME); + return NULL; + } + + /* Assign auxiliary device specific data */ + rdev->netdev = aux_dev->net; + rdev->aux_dev = aux_dev; + rdev->adev = adev; + rdev->fn_id = rdev->aux_dev->pdev->devfn; + + return rdev; +} + + +static int bng_re_register_netdev(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev; + + aux_dev = rdev->aux_dev; + return bnge_register_dev(aux_dev, rdev->adev); +} + +static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev) +{ + struct bng_re_chip_ctx *chip_ctx; + + if (!rdev->chip_ctx) + return; + + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; + + chip_ctx = rdev->chip_ctx; + rdev->chip_ctx = NULL; + rdev->rcfw.res = NULL; + rdev->bng_res.cctx = NULL; + rdev->bng_res.pdev = NULL; + kfree(chip_ctx); +} + +static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev) +{ + struct bng_re_chip_ctx *chip_ctx; + struct bnge_auxr_dev *aux_dev; + int rc = -ENOMEM; + + aux_dev = rdev->aux_dev; + rdev->bng_res.pdev = aux_dev->pdev; + rdev->rcfw.res = &rdev->bng_res; + chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); + if (!chip_ctx) + return -ENOMEM; + chip_ctx->chip_num = aux_dev->chip_num; + chip_ctx->hw_stats_size = aux_dev->hw_ring_stats_size; + + rdev->chip_ctx = chip_ctx; + rdev->bng_res.cctx = rdev->chip_ctx; + rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL); + if (!rdev->dev_attr) + goto free_chip_ctx; + rdev->bng_res.dattr = rdev->dev_attr; + + return 0; +free_chip_ctx: + kfree(rdev->chip_ctx); + rdev->chip_ctx = NULL; + return rc; +} + +static void bng_re_init_hwrm_hdr(struct input *hdr, u16 opcd) +{ + hdr->req_type = cpu_to_le16(opcd); + hdr->cmpl_ring = cpu_to_le16(-1); + hdr->target_id = cpu_to_le16(-1); +} + +static void bng_re_fill_fw_msg(struct bnge_fw_msg *fw_msg, void *msg, + int msg_len, void *resp, int resp_max_len, + int timeout) +{ + fw_msg->msg = msg; + fw_msg->msg_len = msg_len; + fw_msg->resp = resp; + fw_msg->resp_max_len = resp_max_len; + fw_msg->timeout = timeout; +} + +static int bng_re_net_ring_free(struct bng_re_dev *rdev, + u16 fw_ring_id, int type) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_ring_free_input req = {}; + struct hwrm_ring_free_output resp; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!rdev) + return rc; + + if 
(!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE); + req.ring_type = type; + req.ring_id = cpu_to_le16(fw_ring_id); + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) + ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x", + req.ring_id, rc); + return rc; +} + +static int bng_re_net_ring_alloc(struct bng_re_dev *rdev, + struct bng_re_ring_attr *ring_attr, + u16 *fw_ring_id) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_ring_alloc_input req = {}; + struct hwrm_ring_alloc_output resp; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC); + req.enables = 0; + req.page_tbl_addr = cpu_to_le64(ring_attr->dma_arr[0]); + if (ring_attr->pages > 1) { + /* Page size is in log2 units */ + req.page_size = BNGE_PAGE_SHIFT; + req.page_tbl_depth = 1; + } + req.fbo = 0; + /* Association of ring index with doorbell index and MSIX number */ + req.logical_id = cpu_to_le16(ring_attr->lrid); + req.length = cpu_to_le32(ring_attr->depth + 1); + req.ring_type = ring_attr->type; + req.int_mode = ring_attr->mode; + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (!rc) + *fw_ring_id = le16_to_cpu(resp.ring_id); + + return rc; +} + +static int bng_re_stats_ctx_free(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_stat_ctx_free_input req = {}; + struct hwrm_stat_ctx_free_output resp = {}; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE); + req.stat_ctx_id = cpu_to_le32(rdev->stats_ctx.fw_id); + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) + ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x", + rc); + + return rc; +} + +static int bng_re_stats_ctx_alloc(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct bng_re_stats *stats = &rdev->stats_ctx; + struct hwrm_stat_ctx_alloc_output resp = {}; + struct hwrm_stat_ctx_alloc_input req = {}; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + stats->fw_id = BNGE_INVALID_STATS_CTX_ID; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC); + req.update_period_ms = cpu_to_le32(1000); + req.stats_dma_addr = cpu_to_le64(stats->dma_map); + req.stats_dma_length = cpu_to_le16(rdev->chip_ctx->hw_stats_size); + req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE; + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (!rc) + stats->fw_id = le32_to_cpu(resp.stat_ctx_id); + return rc; +} + +static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_ver_get_output ver_get_resp = {}; + struct hwrm_ver_get_input ver_get_req = {}; + struct bng_re_chip_ctx *cctx; + struct bnge_fw_msg fw_msg = {}; + int rc; + + bng_re_init_hwrm_hdr((void *)&ver_get_req, HWRM_VER_GET); + ver_get_req.hwrm_intf_maj = HWRM_VERSION_MAJOR; + ver_get_req.hwrm_intf_min = HWRM_VERSION_MINOR; + ver_get_req.hwrm_intf_upd = 
HWRM_VERSION_UPDATE; + bng_re_fill_fw_msg(&fw_msg, (void *)&ver_get_req, sizeof(ver_get_req), + (void *)&ver_get_resp, sizeof(ver_get_resp), + BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x", + rc); + return; + } + + cctx = rdev->chip_ctx; + cctx->hwrm_intf_ver = + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_major) << 48 | + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_minor) << 32 | + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_build) << 16 | + le16_to_cpu(ver_get_resp.hwrm_intf_patch); + + cctx->hwrm_cmd_max_timeout = le16_to_cpu(ver_get_resp.max_req_timeout); + + if (!cctx->hwrm_cmd_max_timeout) + cctx->hwrm_cmd_max_timeout = BNG_ROCE_FW_MAX_TIMEOUT; +} + +static void bng_re_dev_uninit(struct bng_re_dev *rdev) +{ + int rc; + bng_re_debugfs_rem_pdev(rdev); + + if (test_and_clear_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { + rc = bng_re_deinit_rcfw(&rdev->rcfw); + if (rc) + ibdev_warn(&rdev->ibdev, + "Failed to deinitialize RCFW: %#x", rc); + bng_re_stats_ctx_free(rdev); + bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx); + bng_re_disable_rcfw_channel(&rdev->rcfw); + bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, + RING_ALLOC_REQ_RING_TYPE_NQ); + bng_re_free_rcfw_channel(&rdev->rcfw); + } + + kfree(rdev->nqr); + rdev->nqr = NULL; + bng_re_destroy_chip_ctx(rdev); + if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnge_unregister_dev(rdev->aux_dev); +} + +static int bng_re_dev_init(struct bng_re_dev *rdev) +{ + struct bng_re_ring_attr rattr = {}; + struct bng_re_creq_ctx *creq; + u32 db_offt; + int vid; + u8 type; + int rc; + + /* Registered a new RoCE device instance to netdev */ + rc = bng_re_register_netdev(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to register with netedev: %#x\n", rc); + return -EINVAL; + } + + set_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + + if (rdev->aux_dev->auxr_info->msix_requested < BNG_RE_MIN_MSIX) { + ibdev_err(&rdev->ibdev, + "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n", + rdev->aux_dev->auxr_info->msix_requested); + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return -EINVAL; + } + ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", + rdev->aux_dev->auxr_info->msix_requested); + + rc = bng_re_setup_chip_ctx(rdev); + if (rc) { + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + ibdev_err(&rdev->ibdev, "Failed to get chip context\n"); + return -EINVAL; + } + + bng_re_query_hwrm_version(rdev); + + rc = bng_re_alloc_fw_channel(&rdev->bng_res, &rdev->rcfw); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to allocate RCFW Channel: %#x\n", rc); + goto fail; + } + + /* Allocate nq record memory */ + rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL); + if (!rdev->nqr) { + bng_re_destroy_chip_ctx(rdev); + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return -ENOMEM; + } + + rdev->nqr->num_msix = rdev->aux_dev->auxr_info->msix_requested; + memcpy(rdev->nqr->msix_entries, rdev->aux_dev->msix_info, + sizeof(struct bnge_msix_info) * rdev->nqr->num_msix); + + type = RING_ALLOC_REQ_RING_TYPE_NQ; + creq = &rdev->rcfw.creq; + rattr.dma_arr = creq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr; + rattr.pages = creq->hwq.pbl[creq->hwq.level].pg_count; + rattr.type = type; + rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; + rattr.depth = BNG_FW_CREQE_MAX_CNT - 1; + rattr.lrid = 
rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].ring_idx; + rc = bng_re_net_ring_alloc(rdev, &rattr, &creq->ring_id); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc); + goto free_rcfw; + } + db_offt = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].db_offset; + vid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].vector; + + rc = bng_re_enable_fw_channel(&rdev->rcfw, + vid, db_offt); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n", + rc); + goto free_ring; + } + + rc = bng_re_get_dev_attr(&rdev->rcfw); + if (rc) + goto disable_rcfw; + + bng_re_debugfs_add_pdev(rdev); + rc = bng_re_alloc_stats_ctx_mem(rdev->bng_res.pdev, rdev->chip_ctx, + &rdev->stats_ctx); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to allocate stats context: %#x\n", rc); + goto disable_rcfw; + } + + rc = bng_re_stats_ctx_alloc(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to allocate QPLIB context: %#x\n", rc); + goto free_stats_ctx; + } + + rc = bng_re_init_rcfw(&rdev->rcfw, &rdev->stats_ctx); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to initialize RCFW: %#x\n", rc); + goto free_sctx; + } + set_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags); + + return 0; +free_sctx: + bng_re_stats_ctx_free(rdev); +free_stats_ctx: + bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx); +disable_rcfw: + bng_re_disable_rcfw_channel(&rdev->rcfw); +free_ring: + bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type); +free_rcfw: + bng_re_free_rcfw_channel(&rdev->rcfw); +fail: + bng_re_dev_uninit(rdev); + return rc; +} + +static int bng_re_add_device(struct auxiliary_device *adev) +{ + struct bnge_auxr_priv *auxr_priv = + container_of(adev, struct bnge_auxr_priv, aux_dev); + struct bng_re_en_dev_info *dev_info; + struct bng_re_dev *rdev; + int rc; + + dev_info = auxiliary_get_drvdata(adev); + + rdev = bng_re_dev_add(adev, auxr_priv->auxr_dev); + if (!rdev) { + rc = -ENOMEM; + goto exit; + } + + dev_info->rdev = rdev; + + rc = bng_re_dev_init(rdev); + if (rc) + goto re_dev_dealloc; + + return 0; + +re_dev_dealloc: + ib_dealloc_device(&rdev->ibdev); +exit: + return rc; +} + + +static void bng_re_remove_device(struct bng_re_dev *rdev, + struct auxiliary_device *aux_dev) +{ + bng_re_dev_uninit(rdev); + ib_dealloc_device(&rdev->ibdev); +} + + +static int bng_re_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct bnge_auxr_priv *aux_priv = + container_of(adev, struct bnge_auxr_priv, aux_dev); + struct bng_re_en_dev_info *en_info; + int rc; + + en_info = kzalloc(sizeof(*en_info), GFP_KERNEL); + if (!en_info) + return -ENOMEM; + + en_info->auxr_dev = aux_priv->auxr_dev; + + auxiliary_set_drvdata(adev, en_info); + + rc = bng_re_add_device(adev); + if (rc) + kfree(en_info); + + return rc; +} + +static void bng_re_remove(struct auxiliary_device *adev) +{ + struct bng_re_en_dev_info *dev_info = auxiliary_get_drvdata(adev); + struct bng_re_dev *rdev; + + rdev = dev_info->rdev; + + if (rdev) + bng_re_remove_device(rdev, adev); + kfree(dev_info); +} + +static const struct auxiliary_device_id bng_re_id_table[] = { + { .name = BNG_RE_ADEV_NAME ".rdma", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, bng_re_id_table); + +static struct auxiliary_driver bng_re_driver = { + .name = "rdma", + .probe = bng_re_probe, + .remove = bng_re_remove, + .id_table = bng_re_id_table, +}; + +static int __init bng_re_mod_init(void) +{ + int rc; + + + bng_re_register_debugfs(); + + rc = auxiliary_driver_register(&bng_re_driver); + if (rc) { + pr_err("%s: Failed to 
register auxiliary driver\n", + KBUILD_MODNAME); + goto unreg_debugfs; + } + return 0; +unreg_debugfs: + bng_re_unregister_debugfs(); + return rc; +} + +static void __exit bng_re_mod_exit(void) +{ + auxiliary_driver_unregister(&bng_re_driver); + bng_re_unregister_debugfs(); +} + +module_init(bng_re_mod_init); +module_exit(bng_re_mod_exit); diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c new file mode 100644 index 000000000000..7d9539113cf5 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_fw.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include <linux/pci.h> + +#include "roce_hsi.h" +#include "bng_res.h" +#include "bng_fw.h" +#include "bng_sp.h" + +/** + * bng_re_map_rc - map return type based on opcode + * @opcode: roce slow path opcode + * + * case #1 + * Firmware initiated error recovery is a safe state machine and + * driver can consider all the underlying rdma resources are free. + * In this state, it is safe to return success for opcodes related to + * destroying rdma resources (like destroy qp, destroy cq etc.). + * + * case #2 + * If driver detect potential firmware stall, it is not safe state machine + * and the driver can not consider all the underlying rdma resources are + * freed. + * In this state, it is not safe to return success for opcodes related to + * destroying rdma resources (like destroy qp, destroy cq etc.). + * + * Scope of this helper function is only for case #1. + * + * Returns: + * 0 to communicate success to caller. + * Non zero error code to communicate failure to caller. + */ +static int bng_re_map_rc(u8 opcode) +{ + switch (opcode) { + case CMDQ_BASE_OPCODE_DESTROY_QP: + case CMDQ_BASE_OPCODE_DESTROY_SRQ: + case CMDQ_BASE_OPCODE_DESTROY_CQ: + case CMDQ_BASE_OPCODE_DEALLOCATE_KEY: + case CMDQ_BASE_OPCODE_DEREGISTER_MR: + case CMDQ_BASE_OPCODE_DELETE_GID: + case CMDQ_BASE_OPCODE_DESTROY_QP1: + case CMDQ_BASE_OPCODE_DESTROY_AH: + case CMDQ_BASE_OPCODE_DEINITIALIZE_FW: + case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC: + case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE: + return 0; + default: + return -ETIMEDOUT; + } +} + +void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw) +{ + kfree(rcfw->crsqe_tbl); + bng_re_free_hwq(rcfw->res, &rcfw->cmdq.hwq); + bng_re_free_hwq(rcfw->res, &rcfw->creq.hwq); + rcfw->pdev = NULL; +} + +int bng_re_alloc_fw_channel(struct bng_re_res *res, + struct bng_re_rcfw *rcfw) +{ + struct bng_re_hwq_attr hwq_attr = {}; + struct bng_re_sg_info sginfo = {}; + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_creq_ctx *creq; + + rcfw->pdev = res->pdev; + cmdq = &rcfw->cmdq; + creq = &rcfw->creq; + rcfw->res = res; + + sginfo.pgsize = PAGE_SIZE; + sginfo.pgshft = PAGE_SHIFT; + + hwq_attr.sginfo = &sginfo; + hwq_attr.res = rcfw->res; + hwq_attr.depth = BNG_FW_CREQE_MAX_CNT; + hwq_attr.stride = BNG_FW_CREQE_UNITS; + hwq_attr.type = BNG_HWQ_TYPE_QUEUE; + + if (bng_re_alloc_init_hwq(&creq->hwq, &hwq_attr)) { + dev_err(&rcfw->pdev->dev, + "HW channel CREQ allocation failed\n"); + goto fail; + } + + rcfw->cmdq_depth = BNG_FW_CMDQE_MAX_CNT; + + sginfo.pgsize = bng_fw_cmdqe_page_size(rcfw->cmdq_depth); + hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF; + hwq_attr.stride = BNG_FW_CMDQE_UNITS; + hwq_attr.type = BNG_HWQ_TYPE_CTX; + if (bng_re_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) { + dev_err(&rcfw->pdev->dev, + "HW channel CMDQ allocation failed\n"); + goto fail; + } + + rcfw->crsqe_tbl = kcalloc(cmdq->hwq.max_elements, + sizeof(*rcfw->crsqe_tbl), GFP_KERNEL); + if 
(!rcfw->crsqe_tbl) + goto fail; + + spin_lock_init(&rcfw->tbl_lock); + + rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; + return 0; + +fail: + bng_re_free_rcfw_channel(rcfw); + return -ENOMEM; +} + +static int bng_re_process_qp_event(struct bng_re_rcfw *rcfw, + struct creq_qp_event *qp_event, + u32 *num_wait) +{ + struct bng_re_hwq *hwq = &rcfw->cmdq.hwq; + struct bng_re_crsqe *crsqe; + u32 req_size; + u16 cookie; + bool is_waiter_alive; + struct pci_dev *pdev; + u32 wait_cmds = 0; + int rc = 0; + + pdev = rcfw->pdev; + switch (qp_event->event) { + case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: + dev_err(&pdev->dev, "Received QP error notification\n"); + break; + default: + /* + * Command Response + * cmdq->lock needs to be acquired to synchronize + * the command send and completion reaping. This function + * is always called with creq->lock held. Using + * the nested variant of spin_lock. + * + */ + + spin_lock_nested(&hwq->lock, SINGLE_DEPTH_NESTING); + cookie = le16_to_cpu(qp_event->cookie); + cookie &= BNG_FW_MAX_COOKIE_VALUE; + crsqe = &rcfw->crsqe_tbl[cookie]; + + if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED, + &rcfw->cmdq.flags), + "Unresponsive rcfw channel detected!")) { + dev_info(&pdev->dev, + "rcfw timed out: cookie = %#x, free_slots = %d", + cookie, crsqe->free_slots); + spin_unlock(&hwq->lock); + return rc; + } + + if (crsqe->is_waiter_alive) { + if (crsqe->resp) { + memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); + /* Insert write memory barrier to ensure that + * response data is copied before clearing the + * flags + */ + smp_wmb(); + } + } + + wait_cmds++; + + req_size = crsqe->req_size; + is_waiter_alive = crsqe->is_waiter_alive; + + crsqe->req_size = 0; + if (!is_waiter_alive) + crsqe->resp = NULL; + + crsqe->is_in_used = false; + + hwq->cons += req_size; + + spin_unlock(&hwq->lock); + } + *num_wait += wait_cmds; + return rc; +} + +/* function events */ +static int bng_re_process_func_event(struct bng_re_rcfw *rcfw, + struct creq_func_event *func_event) +{ + switch (func_event->event) { + case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: + case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: + case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR: + case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR: + case CREQ_FUNC_EVENT_EVENT_CQ_ERROR: + case CREQ_FUNC_EVENT_EVENT_TQM_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR: + case CREQ_FUNC_EVENT_EVENT_TIM_ERROR: + case CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST: + case CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED: + break; + default: + return -EINVAL; + } + + return 0; +} + +/* CREQ Completion handlers */ +static void bng_re_service_creq(struct tasklet_struct *t) +{ + struct bng_re_rcfw *rcfw = from_tasklet(rcfw, t, creq.creq_tasklet); + struct bng_re_creq_ctx *creq = &rcfw->creq; + u32 type, budget = BNG_FW_CREQ_ENTRY_POLL_BUDGET; + struct bng_re_hwq *hwq = &creq->hwq; + struct creq_base *creqe; + u32 num_wakeup = 0; + u32 hw_polled = 0; + + /* Service the CREQ until budget is over */ + spin_lock_bh(&hwq->lock); + while (budget > 0) { + creqe = bng_re_get_qe(hwq, hwq->cons, NULL); + if (!BNG_FW_CREQ_CMP_VALID(creqe, creq->creq_db.dbinfo.flags)) + break; + /* The valid test of the entry must be done first before + * reading any further.
+ */ + dma_rmb(); + + type = creqe->type & CREQ_BASE_TYPE_MASK; + switch (type) { + case CREQ_BASE_TYPE_QP_EVENT: + bng_re_process_qp_event + (rcfw, (struct creq_qp_event *)creqe, + &num_wakeup); + creq->stats.creq_qp_event_processed++; + break; + case CREQ_BASE_TYPE_FUNC_EVENT: + if (!bng_re_process_func_event + (rcfw, (struct creq_func_event *)creqe)) + creq->stats.creq_func_event_processed++; + else + dev_warn(&rcfw->pdev->dev, + "aeqe:%#x Not handled\n", type); + break; + default: + if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT) + dev_warn(&rcfw->pdev->dev, + "creqe with event 0x%x not handled\n", + type); + break; + } + budget--; + hw_polled++; + bng_re_hwq_incr_cons(hwq->max_elements, &hwq->cons, + 1, &creq->creq_db.dbinfo.flags); + } + + if (hw_polled) + bng_re_ring_nq_db(&creq->creq_db.dbinfo, + rcfw->res->cctx, true); + spin_unlock_bh(&hwq->lock); + if (num_wakeup) + wake_up_nr(&rcfw->cmdq.waitq, num_wakeup); +} + +static int __send_message_basic_sanity(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg, + u8 opcode) +{ + struct bng_re_cmdq_ctx *cmdq; + + cmdq = &rcfw->cmdq; + + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; + + if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && + opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { + dev_err(&rcfw->pdev->dev, "RCFW already initialized!"); + return -EINVAL; + } + + if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && + (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && + opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && + opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { + dev_err(&rcfw->pdev->dev, + "RCFW not initialized, reject opcode 0x%x", + opcode); + return -EOPNOTSUPP; + } + + return 0; +} + +static int __send_message(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg, u8 opcode) +{ + u32 bsize, free_slots, required_slots; + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_crsqe *crsqe; + struct bng_fw_cmdqe *cmdqe; + struct bng_re_hwq *hwq; + u32 sw_prod, cmdq_prod; + struct pci_dev *pdev; + u16 cookie; + u8 *preq; + + cmdq = &rcfw->cmdq; + hwq = &cmdq->hwq; + pdev = rcfw->pdev; + + /* Cmdq are in 16-byte units, each request can consume 1 or more + * cmdqe + */ + spin_lock_bh(&hwq->lock); + required_slots = bng_re_get_cmd_slots(msg->req); + free_slots = HWQ_FREE_SLOTS(hwq); + cookie = cmdq->seq_num & BNG_FW_MAX_COOKIE_VALUE; + crsqe = &rcfw->crsqe_tbl[cookie]; + + if (required_slots >= free_slots) { + dev_info_ratelimited(&pdev->dev, + "CMDQ is full req/free %d/%d!", + required_slots, free_slots); + spin_unlock_bh(&hwq->lock); + return -EAGAIN; + } + __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie)); + + bsize = bng_re_set_cmd_slots(msg->req); + crsqe->free_slots = free_slots; + crsqe->resp = (struct creq_qp_event *)msg->resp; + crsqe->is_waiter_alive = true; + crsqe->is_in_used = true; + crsqe->opcode = opcode; + + crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); + if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) { + struct bng_re_rcfw_sbuf *sbuf = msg->sb; + + __set_cmdq_base_resp_addr(msg->req, msg->req_sz, + cpu_to_le64(sbuf->dma_addr)); + __set_cmdq_base_resp_size(msg->req, msg->req_sz, + ALIGN(sbuf->size, + BNG_FW_CMDQE_UNITS) / + BNG_FW_CMDQE_UNITS); + } + + preq = (u8 *)msg->req; + do { + /* Locate the next cmdq slot */ + sw_prod = HWQ_CMP(hwq->prod, hwq); + cmdqe = bng_re_get_qe(hwq, sw_prod, NULL); + /* Copy a segment of the req cmd to the cmdq */ + memset(cmdqe, 0, sizeof(*cmdqe)); + memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe))); + 
preq += min_t(u32, bsize, sizeof(*cmdqe)); + bsize -= min_t(u32, bsize, sizeof(*cmdqe)); + hwq->prod++; + } while (bsize > 0); + cmdq->seq_num++; + + cmdq_prod = hwq->prod & 0xFFFF; + if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) { + /* The very first doorbell write + * is required to set this flag + * which prompts the FW to reset + * its internal pointers + */ + cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG); + clear_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags); + } + /* ring CMDQ DB */ + wmb(); + writel(cmdq_prod, cmdq->cmdq_mbox.prod); + writel(BNG_FW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); + spin_unlock_bh(&hwq->lock); + /* Return the CREQ response pointer */ + return 0; +} + +/** + * __wait_for_resp - Don't hold the cpu context and wait for response + * @rcfw: rcfw channel instance of rdev + * @cookie: cookie to track the command + * + * Wait for command completion in sleepable context. + * + * Returns: + * 0 if command is completed by firmware. + * Non zero error code for rest of the case. + */ +static int __wait_for_resp(struct bng_re_rcfw *rcfw, u16 cookie) +{ + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_crsqe *crsqe; + + cmdq = &rcfw->cmdq; + crsqe = &rcfw->crsqe_tbl[cookie]; + + do { + wait_event_timeout(cmdq->waitq, + !crsqe->is_in_used, + secs_to_jiffies(rcfw->max_timeout)); + + if (!crsqe->is_in_used) + return 0; + + bng_re_service_creq(&rcfw->creq.creq_tasklet); + + if (!crsqe->is_in_used) + return 0; + } while (true); +}; + +/** + * bng_re_rcfw_send_message - interface to send + * and complete rcfw command. + * @rcfw: rcfw channel instance of rdev + * @msg: message to send + * + * This function does not account shadow queue depth. It will send + * all the command unconditionally as long as send queue is not full. + * + * Returns: + * 0 if command completed by firmware. + * Non zero if the command is not completed by firmware. + */ +int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg) +{ + struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; + struct bng_re_crsqe *crsqe; + u16 cookie; + int rc; + u8 opcode; + + opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); + + rc = __send_message_basic_sanity(rcfw, msg, opcode); + if (rc) + return rc == -ENXIO ? 
bng_re_map_rc(opcode) : rc; + + rc = __send_message(rcfw, msg, opcode); + if (rc) + return rc; + + cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) + & BNG_FW_MAX_COOKIE_VALUE; + + rc = __wait_for_resp(rcfw, cookie); + + if (rc) { + spin_lock_bh(&rcfw->cmdq.hwq.lock); + crsqe = &rcfw->crsqe_tbl[cookie]; + crsqe->is_waiter_alive = false; + if (rc == -ENODEV) + set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); + spin_unlock_bh(&rcfw->cmdq.hwq.lock); + return -ETIMEDOUT; + } + + if (evnt->status) { + /* failed with status */ + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", + cookie, opcode, evnt->status); + rc = -EIO; + } + + return rc; +} + +static int bng_re_map_cmdq_mbox(struct bng_re_rcfw *rcfw) +{ + struct bng_re_cmdq_mbox *mbox; + resource_size_t bar_reg; + struct pci_dev *pdev; + + pdev = rcfw->pdev; + mbox = &rcfw->cmdq.cmdq_mbox; + + mbox->reg.bar_id = BNG_FW_COMM_PCI_BAR_REGION; + mbox->reg.len = BNG_FW_COMM_SIZE; + mbox->reg.bar_base = pci_resource_start(pdev, mbox->reg.bar_id); + if (!mbox->reg.bar_base) { + dev_err(&pdev->dev, + "CMDQ BAR region %d resc start is 0!\n", + mbox->reg.bar_id); + return -ENOMEM; + } + + bar_reg = mbox->reg.bar_base + BNG_FW_COMM_BASE_OFFSET; + mbox->reg.len = BNG_FW_COMM_SIZE; + mbox->reg.bar_reg = ioremap(bar_reg, mbox->reg.len); + if (!mbox->reg.bar_reg) { + dev_err(&pdev->dev, + "CMDQ BAR region %d mapping failed\n", + mbox->reg.bar_id); + return -ENOMEM; + } + + mbox->prod = (void __iomem *)(mbox->reg.bar_reg + + BNG_FW_PF_VF_COMM_PROD_OFFSET); + mbox->db = (void __iomem *)(mbox->reg.bar_reg + BNG_FW_COMM_TRIG_OFFSET); + return 0; +} + +static irqreturn_t bng_re_creq_irq(int irq, void *dev_instance) +{ + struct bng_re_rcfw *rcfw = dev_instance; + struct bng_re_creq_ctx *creq; + struct bng_re_hwq *hwq; + u32 sw_cons; + + creq = &rcfw->creq; + hwq = &creq->hwq; + /* Prefetch the CREQ element */ + sw_cons = HWQ_CMP(hwq->cons, hwq); + bng_re_get_qe(hwq, sw_cons, NULL); + + tasklet_schedule(&creq->creq_tasklet); + return IRQ_HANDLED; +} + +int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector, + bool need_init) +{ + struct bng_re_creq_ctx *creq; + struct bng_re_res *res; + int rc; + + creq = &rcfw->creq; + res = rcfw->res; + + if (creq->irq_handler_avail) + return -EFAULT; + + creq->msix_vec = msix_vector; + if (need_init) + tasklet_setup(&creq->creq_tasklet, bng_re_service_creq); + else + tasklet_enable(&creq->creq_tasklet); + + creq->irq_name = kasprintf(GFP_KERNEL, "bng_re-creq@pci:%s", + pci_name(res->pdev)); + if (!creq->irq_name) + return -ENOMEM; + rc = request_irq(creq->msix_vec, bng_re_creq_irq, 0, + creq->irq_name, rcfw); + if (rc) { + kfree(creq->irq_name); + creq->irq_name = NULL; + tasklet_disable(&creq->creq_tasklet); + return rc; + } + creq->irq_handler_avail = true; + + bng_re_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true); + atomic_inc(&rcfw->rcfw_intr_enabled); + + return 0; +} + +static int bng_re_map_creq_db(struct bng_re_rcfw *rcfw, u32 reg_offt) +{ + struct bng_re_creq_db *creq_db; + resource_size_t bar_reg; + struct pci_dev *pdev; + + pdev = rcfw->pdev; + creq_db = &rcfw->creq.creq_db; + + creq_db->dbinfo.flags = 0; + creq_db->reg.bar_id = BNG_FW_COMM_CONS_PCI_BAR_REGION; + creq_db->reg.bar_base = pci_resource_start(pdev, creq_db->reg.bar_id); + if (!creq_db->reg.bar_id) + dev_err(&pdev->dev, + "CREQ BAR region %d resc start is 0!", + creq_db->reg.bar_id); + + bar_reg = creq_db->reg.bar_base + reg_offt; + + creq_db->reg.len = BNG_FW_CREQ_DB_LEN; + creq_db->reg.bar_reg = 
ioremap(bar_reg, creq_db->reg.len); + if (!creq_db->reg.bar_reg) { + dev_err(&pdev->dev, + "CREQ BAR region %d mapping failed", + creq_db->reg.bar_id); + return -ENOMEM; + } + creq_db->dbinfo.db = creq_db->reg.bar_reg; + creq_db->dbinfo.hwq = &rcfw->creq.hwq; + creq_db->dbinfo.xid = rcfw->creq.ring_id; + return 0; +} + +void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill) +{ + struct bng_re_creq_ctx *creq; + + creq = &rcfw->creq; + + if (!creq->irq_handler_avail) + return; + + creq->irq_handler_avail = false; + /* Mask h/w interrupts */ + bng_re_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false); + /* Sync with last running IRQ-handler */ + synchronize_irq(creq->msix_vec); + free_irq(creq->msix_vec, rcfw); + kfree(creq->irq_name); + creq->irq_name = NULL; + atomic_set(&rcfw->rcfw_intr_enabled, 0); + if (kill) + tasklet_kill(&creq->creq_tasklet); + tasklet_disable(&creq->creq_tasklet); +} + +void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw) +{ + struct bng_re_creq_ctx *creq; + struct bng_re_cmdq_ctx *cmdq; + + creq = &rcfw->creq; + cmdq = &rcfw->cmdq; + /* Make sure the HW channel is stopped! */ + bng_re_rcfw_stop_irq(rcfw, true); + + iounmap(cmdq->cmdq_mbox.reg.bar_reg); + iounmap(creq->creq_db.reg.bar_reg); + + cmdq->cmdq_mbox.reg.bar_reg = NULL; + creq->creq_db.reg.bar_reg = NULL; + creq->msix_vec = 0; +} + +static void bng_re_start_rcfw(struct bng_re_rcfw *rcfw) +{ + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_creq_ctx *creq; + struct bng_re_cmdq_mbox *mbox; + struct cmdq_init init = {0}; + + cmdq = &rcfw->cmdq; + creq = &rcfw->creq; + mbox = &cmdq->cmdq_mbox; + + init.cmdq_pbl = cpu_to_le64(cmdq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr[0]); + init.cmdq_size_cmdq_lvl = + cpu_to_le16(((rcfw->cmdq_depth << + CMDQ_INIT_CMDQ_SIZE_SFT) & + CMDQ_INIT_CMDQ_SIZE_MASK) | + ((cmdq->hwq.level << + CMDQ_INIT_CMDQ_LVL_SFT) & + CMDQ_INIT_CMDQ_LVL_MASK)); + init.creq_ring_id = cpu_to_le16(creq->ring_id); + /* Write to the mailbox register */ + __iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4); +} + +int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off) +{ + struct bng_re_cmdq_ctx *cmdq; + int rc; + + cmdq = &rcfw->cmdq; + + /* Assign defaults */ + cmdq->seq_num = 0; + set_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags); + init_waitqueue_head(&cmdq->waitq); + + rc = bng_re_map_cmdq_mbox(rcfw); + if (rc) + return rc; + + rc = bng_re_map_creq_db(rcfw, cp_bar_reg_off); + if (rc) + return rc; + + rc = bng_re_rcfw_start_irq(rcfw, msix_vector, true); + if (rc) { + dev_err(&rcfw->pdev->dev, + "Failed to request IRQ for CREQ rc = 0x%x\n", rc); + bng_re_disable_rcfw_channel(rcfw); + return rc; + } + + bng_re_start_rcfw(rcfw); + return 0; +} + +int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw) +{ + struct creq_deinitialize_fw_resp resp = {}; + struct cmdq_deinitialize_fw req = {}; + struct bng_re_cmdqmsg msg = {}; + int rc; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_DEINITIALIZE_FW, + sizeof(req)); + bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, + sizeof(req), sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + return rc; + + clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags); + return 0; +} +static inline bool _is_hw_retx_supported(u16 dev_cap_flags) +{ + return dev_cap_flags & + (CREQ_QUERY_FUNC_RESP_SB_HW_REQUESTER_RETX_ENABLED | + CREQ_QUERY_FUNC_RESP_SB_HW_RESPONDER_RETX_ENABLED); +} + +#define BNG_RE_HW_RETX(a) _is_hw_retx_supported((a)) +static inline bool 
_is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & + CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; +} + +int bng_re_init_rcfw(struct bng_re_rcfw *rcfw, + struct bng_re_stats *stats_ctx) +{ + struct creq_initialize_fw_resp resp = {}; + struct cmdq_initialize_fw req = {}; + struct bng_re_cmdqmsg msg = {}; + int rc; + u16 flags = 0; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_INITIALIZE_FW, + sizeof(req)); + /* Supply (log-base-2-of-host-page-size - base-page-shift) + * to bono to adjust the doorbell page sizes. + */ + req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT - + BNG_FW_DBR_BASE_PAGE_SHIFT); + if (BNG_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED; + if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; + req.flags |= cpu_to_le16(flags); + req.stat_ctx_id = cpu_to_le32(stats_ctx->fw_id); + bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + return rc; + set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags); + return 0; +} diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h new file mode 100644 index 000000000000..c89c926ec2fc --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_fw.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. + +#ifndef __BNG_FW_H__ +#define __BNG_FW_H__ + +#include "bng_tlv.h" + +/* FW DB related */ +#define BNG_FW_CMDQ_TRIG_VAL 1 +#define BNG_FW_COMM_PCI_BAR_REGION 0 +#define BNG_FW_COMM_CONS_PCI_BAR_REGION 2 +#define BNG_FW_DBR_BASE_PAGE_SHIFT 12 +#define BNG_FW_COMM_SIZE 0x104 +#define BNG_FW_COMM_BASE_OFFSET 0x600 +#define BNG_FW_COMM_TRIG_OFFSET 0x100 +#define BNG_FW_PF_VF_COMM_PROD_OFFSET 0xc +#define BNG_FW_CREQ_DB_LEN 8 + +/* CREQ */ +#define BNG_FW_CREQE_MAX_CNT (64 * 1024) +#define BNG_FW_CREQE_UNITS 16 +#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100 +#define BNG_FW_CREQ_CMP_VALID(hdr, pass) \ + (!!((hdr)->v & CREQ_BASE_V) == \ + !((pass) & BNG_RE_FLAG_EPOCH_CONS_MASK)) +#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100 + +/* CMDQ */ +struct bng_fw_cmdqe { + u8 data[16]; +}; + +#define BNG_FW_CMDQE_MAX_CNT 8192 +#define BNG_FW_CMDQE_UNITS sizeof(struct bng_fw_cmdqe) +#define BNG_FW_CMDQE_BYTES(depth) ((depth) * BNG_FW_CMDQE_UNITS) + +#define BNG_FW_MAX_COOKIE_VALUE (BNG_FW_CMDQE_MAX_CNT - 1) +#define BNG_FW_CMD_IS_BLOCKING 0x8000 + +/* Crsq buf is 1024-Byte */ +struct bng_re_crsbe { + u8 data[1024]; +}; + + +static inline u32 bng_fw_cmdqe_npages(u32 depth) +{ + u32 npages; + + npages = BNG_FW_CMDQE_BYTES(depth) / PAGE_SIZE; + if (BNG_FW_CMDQE_BYTES(depth) % PAGE_SIZE) + npages++; + return npages; +} + +static inline u32 bng_fw_cmdqe_page_size(u32 depth) +{ + return (bng_fw_cmdqe_npages(depth) * PAGE_SIZE); +} +struct bng_re_cmdq_mbox { + struct bng_re_reg_desc reg; + void __iomem *prod; + void __iomem *db; +}; + +/* HWQ */ +struct bng_re_cmdq_ctx { + struct bng_re_hwq hwq; + struct bng_re_cmdq_mbox cmdq_mbox; + unsigned long flags; +#define FIRMWARE_INITIALIZED_FLAG (0) +#define FIRMWARE_STALL_DETECTED (3) +#define FIRMWARE_FIRST_FLAG (31) + wait_queue_head_t waitq; + u32 seq_num; +}; + +struct bng_re_creq_db { + struct bng_re_reg_desc reg; + struct bng_re_db_info dbinfo; +}; + +struct bng_re_creq_stat { + u64 creq_qp_event_processed; + u64 creq_func_event_processed; +}; + +struct 
bng_re_creq_ctx { + struct bng_re_hwq hwq; + struct bng_re_creq_db creq_db; + struct bng_re_creq_stat stats; + struct tasklet_struct creq_tasklet; + u16 ring_id; + int msix_vec; + bool irq_handler_avail; + char *irq_name; +}; + +struct bng_re_crsqe { + struct creq_qp_event *resp; + u32 req_size; + /* Free slots at the time of submission */ + u32 free_slots; + u8 opcode; + bool is_waiter_alive; + bool is_in_used; +}; + +struct bng_re_rcfw_sbuf { + void *sb; + dma_addr_t dma_addr; + u32 size; +}; + +/* RoCE FW Communication Channels */ +struct bng_re_rcfw { + struct pci_dev *pdev; + struct bng_re_res *res; + struct bng_re_cmdq_ctx cmdq; + struct bng_re_creq_ctx creq; + struct bng_re_crsqe *crsqe_tbl; + /* To synchronize the qp-handle hash table */ + spinlock_t tbl_lock; + u32 cmdq_depth; + /* cached from chip cctx for quick reference in slow path */ + u16 max_timeout; + atomic_t rcfw_intr_enabled; +}; + +struct bng_re_cmdqmsg { + struct cmdq_base *req; + struct creq_base *resp; + void *sb; + u32 req_sz; + u32 res_sz; + u8 block; +}; + +static inline void bng_re_rcfw_cmd_prep(struct cmdq_base *req, + u8 opcode, u8 cmd_size) +{ + req->opcode = opcode; + req->cmd_size = cmd_size; +} + +static inline void bng_re_fill_cmdqmsg(struct bng_re_cmdqmsg *msg, + void *req, void *resp, void *sb, + u32 req_sz, u32 res_sz, u8 block) +{ + msg->req = req; + msg->resp = resp; + msg->sb = sb; + msg->req_sz = req_sz; + msg->res_sz = res_sz; + msg->block = block; +} + +/* Get the number of command units required for the req. The + * function returns correct value only if called before + * setting using bng_re_set_cmd_slots + */ +static inline u32 bng_re_get_cmd_slots(struct cmdq_base *req) +{ + u32 cmd_units = 0; + + if (HAS_TLV_HEADER(req)) { + struct roce_tlv *tlv_req = (struct roce_tlv *)req; + + cmd_units = tlv_req->total_size; + } else { + cmd_units = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) / + BNG_FW_CMDQE_UNITS; + } + + return cmd_units; +} + +static inline u32 bng_re_set_cmd_slots(struct cmdq_base *req) +{ + u32 cmd_byte = 0; + + if (HAS_TLV_HEADER(req)) { + struct roce_tlv *tlv_req = (struct roce_tlv *)req; + + cmd_byte = tlv_req->total_size * BNG_FW_CMDQE_UNITS; + } else { + cmd_byte = req->cmd_size; + req->cmd_size = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) / + BNG_FW_CMDQE_UNITS; + } + + return cmd_byte; +} + +void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw); +int bng_re_alloc_fw_channel(struct bng_re_res *res, + struct bng_re_rcfw *rcfw); +int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off); +void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw); +int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector, + bool need_init); +void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill); +int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg); +int bng_re_init_rcfw(struct bng_re_rcfw *rcfw, + struct bng_re_stats *stats_ctx); +int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h new file mode 100644 index 000000000000..dae4862621a7 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. + +#ifndef __BNG_RE_H__ +#define __BNG_RE_H__ + +#include "bng_res.h" + +#define BNG_RE_ADEV_NAME "bng_en" + +#define BNG_RE_DESC "Broadcom 800G RoCE Driver" + +#define rdev_to_dev(rdev) ((rdev) ? 
(&(rdev)->ibdev.dev) : NULL) + +#define BNG_RE_MIN_MSIX 2 +#define BNG_RE_MAX_MSIX BNGE_MAX_ROCE_MSIX + +#define BNG_RE_CREQ_NQ_IDX 0 + +#define BNGE_INVALID_STATS_CTX_ID -1 +/* NQ specific structures */ +struct bng_re_nq_db { + struct bng_re_reg_desc reg; + struct bng_re_db_info dbinfo; +}; + +struct bng_re_nq { + struct pci_dev *pdev; + struct bng_re_res *res; + char *name; + struct bng_re_hwq hwq; + struct bng_re_nq_db nq_db; + u16 ring_id; + int msix_vec; + cpumask_t mask; + struct tasklet_struct nq_tasklet; + bool requested; + int budget; + u32 load; + + struct workqueue_struct *cqn_wq; +}; + +struct bng_re_nq_record { + struct bnge_msix_info msix_entries[BNG_RE_MAX_MSIX]; + struct bng_re_nq nq[BNG_RE_MAX_MSIX]; + int num_msix; + /* serialize NQ access */ + struct mutex load_lock; +}; + +struct bng_re_en_dev_info { + struct bng_re_dev *rdev; + struct bnge_auxr_dev *auxr_dev; +}; + +struct bng_re_ring_attr { + dma_addr_t *dma_arr; + int pages; + int type; + u32 depth; + u32 lrid; /* Logical ring id */ + u8 mode; +}; + +struct bng_re_dev { + struct ib_device ibdev; + unsigned long flags; +#define BNG_RE_FLAG_NETDEV_REGISTERED 0 +#define BNG_RE_FLAG_RCFW_CHANNEL_EN 1 + struct net_device *netdev; + struct auxiliary_device *adev; + struct bnge_auxr_dev *aux_dev; + struct bng_re_chip_ctx *chip_ctx; + int fn_id; + struct bng_re_res bng_res; + struct bng_re_rcfw rcfw; + struct bng_re_nq_record *nqr; + /* Device Resources */ + struct bng_re_dev_attr *dev_attr; + struct dentry *dbg_root; + struct bng_re_stats stats_ctx; +}; + +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_res.c b/drivers/infiniband/hw/bng_re/bng_res.c new file mode 100644 index 000000000000..c50823758b53 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_res.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. 
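+/*
+ * HWQ memory is described by a page buffer list (PBL) hierarchy of up
+ * to three levels: the last level holds the queue pages themselves and
+ * every level above it holds page pointers.  As a rough example,
+ * assuming 4K pages, a queue of 8192 entries of 16 bytes needs
+ * 32 pages and is served by a single level of indirection (one PBL
+ * page pointing at the 32 queue pages); only a queue needing more
+ * than 512 pages gets a second level of indirection.
+ */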
+ +#include <linux/pci.h> +#include <linux/vmalloc.h> +#include <rdma/ib_umem.h> + +#include <linux/bnxt/hsi.h> +#include "bng_res.h" +#include "roce_hsi.h" + +/* Stats */ +void bng_re_free_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_stats *stats) +{ + if (stats->dma) { + dma_free_coherent(&pdev->dev, stats->size, + stats->dma, stats->dma_map); + } + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; +} + +int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_chip_ctx *cctx, + struct bng_re_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; + stats->size = cctx->hw_stats_size; + stats->dma = dma_alloc_coherent(&pdev->dev, stats->size, + &stats->dma_map, GFP_KERNEL); + if (!stats->dma) + return -ENOMEM; + + return 0; +} + +static void bng_free_pbl(struct bng_re_res *res, struct bng_re_pbl *pbl) +{ + struct pci_dev *pdev = res->pdev; + int i; + + for (i = 0; i < pbl->pg_count; i++) { + if (pbl->pg_arr[i]) + dma_free_coherent(&pdev->dev, pbl->pg_size, + (void *)((unsigned long) + pbl->pg_arr[i] & + PAGE_MASK), + pbl->pg_map_arr[i]); + else + dev_warn(&pdev->dev, + "PBL free pg_arr[%d] empty?!\n", i); + pbl->pg_arr[i] = NULL; + } + + vfree(pbl->pg_arr); + pbl->pg_arr = NULL; + vfree(pbl->pg_map_arr); + pbl->pg_map_arr = NULL; + pbl->pg_count = 0; + pbl->pg_size = 0; +} + +static int bng_alloc_pbl(struct bng_re_res *res, + struct bng_re_pbl *pbl, + struct bng_re_sg_info *sginfo) +{ + struct pci_dev *pdev = res->pdev; + u32 pages; + int i; + + if (sginfo->nopte) + return 0; + pages = sginfo->npages; + + /* page ptr arrays */ + pbl->pg_arr = vmalloc_array(pages, sizeof(void *)); + if (!pbl->pg_arr) + return -ENOMEM; + + pbl->pg_map_arr = vmalloc_array(pages, sizeof(dma_addr_t)); + if (!pbl->pg_map_arr) { + vfree(pbl->pg_arr); + pbl->pg_arr = NULL; + return -ENOMEM; + } + pbl->pg_count = 0; + pbl->pg_size = sginfo->pgsize; + + for (i = 0; i < pages; i++) { + pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev, + pbl->pg_size, + &pbl->pg_map_arr[i], + GFP_KERNEL); + if (!pbl->pg_arr[i]) + goto fail; + pbl->pg_count++; + } + + return 0; +fail: + bng_free_pbl(res, pbl); + return -ENOMEM; +} + +void bng_re_free_hwq(struct bng_re_res *res, + struct bng_re_hwq *hwq) +{ + int i; + + if (!hwq->max_elements) + return; + if (hwq->level >= BNG_PBL_LVL_MAX) + return; + + for (i = 0; i < hwq->level + 1; i++) + bng_free_pbl(res, &hwq->pbl[i]); + + hwq->level = BNG_PBL_LVL_MAX; + hwq->max_elements = 0; + hwq->element_size = 0; + hwq->prod = 0; + hwq->cons = 0; +} + +/* All HWQs are power of 2 in size */ +int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq, + struct bng_re_hwq_attr *hwq_attr) +{ + u32 npages, pg_size; + struct bng_re_sg_info sginfo = {}; + u32 depth, stride, npbl, npde; + dma_addr_t *src_phys_ptr, **dst_virt_ptr; + struct bng_re_res *res; + struct pci_dev *pdev; + int i, rc, lvl; + + res = hwq_attr->res; + pdev = res->pdev; + pg_size = hwq_attr->sginfo->pgsize; + hwq->level = BNG_PBL_LVL_MAX; + + depth = roundup_pow_of_two(hwq_attr->depth); + stride = roundup_pow_of_two(hwq_attr->stride); + + npages = (depth * stride) / pg_size; + if ((depth * stride) % pg_size) + npages++; + if (!npages) + return -EINVAL; + hwq_attr->sginfo->npages = npages; + + if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) { + /* This request is Level 0, map PTE */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_0; + goto done; + } + + if (npages >= MAX_PBL_LVL_0_PGS) { + if (npages > 
MAX_PBL_LVL_1_PGS) { + u32 flag = PTU_PTE_VALID; + /* 2 levels of indirection */ + npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT; + if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT)) + npbl++; + npde = npbl >> MAX_PDL_LVL_SHIFT; + if (npbl % BIT(MAX_PDL_LVL_SHIFT)) + npde++; + /* Alloc PDE pages */ + sginfo.pgsize = npde * pg_size; + sginfo.npages = 1; + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo); + if (rc) + goto fail; + + /* Alloc PBL pages */ + sginfo.npages = npbl; + sginfo.pgsize = PAGE_SIZE; + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], &sginfo); + if (rc) + goto fail; + /* Fill PDL with PBL page pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++) + dst_virt_ptr[0][i] = src_phys_ptr[i] | flag; + + /* Alloc or init PTEs */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_2], + hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_2; + if (hwq_attr->sginfo->nopte) + goto done; + /* Fill PBLs with PTE pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_1].pg_arr; + src_phys_ptr = hwq->pbl[BNG_PBL_LVL_2].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_2].pg_count; i++) { + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | PTU_PTE_VALID; + } + if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[BNG_PBL_LVL_2].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + } else { /* pages < 512 npbl = 1, npde = 0 */ + u32 flag = PTU_PTE_VALID; + + /* 1 level of indirection */ + npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT; + if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT)) + npbl++; + sginfo.npages = npbl; + sginfo.pgsize = PAGE_SIZE; + /* Alloc PBL page */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo); + if (rc) + goto fail; + /* Alloc or init PTEs */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], + hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_1; + if (hwq_attr->sginfo->nopte) + goto done; + /* Fill PBL with PTE pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++) + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | flag; + if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[BNG_PBL_LVL_1].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + } + } +done: + hwq->prod = 0; + hwq->cons = 0; + hwq->pdev = pdev; + hwq->depth = hwq_attr->depth; + hwq->max_elements = hwq->depth; + hwq->element_size = stride; + hwq->qe_ppg = pg_size / stride; + /* For direct access to the elements */ + lvl = hwq->level; + if (hwq_attr->sginfo->nopte && hwq->level) + lvl = hwq->level - 1; + hwq->pbl_ptr = hwq->pbl[lvl].pg_arr; + hwq->pbl_dma_ptr = hwq->pbl[lvl].pg_map_arr; + spin_lock_init(&hwq->lock); + + return 0; +fail: + bng_re_free_hwq(res, hwq); + return -ENOMEM; +} diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h new file mode 100644 index 000000000000..9997f86d6a0e --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_res.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 
Broadcom. + +#ifndef __BNG_RES_H__ +#define __BNG_RES_H__ + +#include "roce_hsi.h" + +#define BNG_ROCE_FW_MAX_TIMEOUT 60 + +#define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) +#define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1) +#define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG) +#define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG) + +#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1)) +#define HWQ_FREE_SLOTS(hwq) (hwq->max_elements - \ + ((HWQ_CMP(hwq->prod, hwq)\ + - HWQ_CMP(hwq->cons, hwq))\ + & (hwq->max_elements - 1))) + +#define MAX_PBL_LVL_0_PGS 1 +#define MAX_PBL_LVL_1_PGS 512 +#define MAX_PBL_LVL_1_PGS_SHIFT 9 +#define MAX_PBL_LVL_1_PGS_FOR_LVL_2 256 +#define MAX_PBL_LVL_2_PGS (256 * 512) +#define MAX_PDL_LVL_SHIFT 9 + +#define BNG_RE_DBR_VALID (0x1UL << 26) +#define BNG_RE_DBR_EPOCH_SHIFT 24 +#define BNG_RE_DBR_TOGGLE_SHIFT 25 + +#define BNG_MAX_TQM_ALLOC_REQ 48 + +struct bng_re_reg_desc { + u8 bar_id; + resource_size_t bar_base; + unsigned long offset; + void __iomem *bar_reg; + size_t len; +}; + +struct bng_re_db_info { + void __iomem *db; + void __iomem *priv_db; + struct bng_re_hwq *hwq; + u32 xid; + u32 max_slot; + u32 flags; + u8 toggle; +}; + +enum bng_re_db_info_flags_mask { + BNG_RE_FLAG_EPOCH_CONS_SHIFT = 0x0UL, + BNG_RE_FLAG_EPOCH_PROD_SHIFT = 0x1UL, + BNG_RE_FLAG_EPOCH_CONS_MASK = 0x1UL, + BNG_RE_FLAG_EPOCH_PROD_MASK = 0x2UL, +}; + +enum bng_re_db_epoch_flag_shift { + BNG_RE_DB_EPOCH_CONS_SHIFT = BNG_RE_DBR_EPOCH_SHIFT, + BNG_RE_DB_EPOCH_PROD_SHIFT = (BNG_RE_DBR_EPOCH_SHIFT - 1), +}; + +struct bng_re_chip_ctx { + u16 chip_num; + u16 hw_stats_size; + u64 hwrm_intf_ver; + u16 hwrm_cmd_max_timeout; +}; + +struct bng_re_pbl { + u32 pg_count; + u32 pg_size; + void **pg_arr; + dma_addr_t *pg_map_arr; +}; + +enum bng_re_pbl_lvl { + BNG_PBL_LVL_0, + BNG_PBL_LVL_1, + BNG_PBL_LVL_2, + BNG_PBL_LVL_MAX +}; + +enum bng_re_hwq_type { + BNG_HWQ_TYPE_CTX, + BNG_HWQ_TYPE_QUEUE +}; + +struct bng_re_sg_info { + u32 npages; + u32 pgshft; + u32 pgsize; + bool nopte; +}; + +struct bng_re_hwq_attr { + struct bng_re_res *res; + struct bng_re_sg_info *sginfo; + enum bng_re_hwq_type type; + u32 depth; + u32 stride; + u32 aux_stride; + u32 aux_depth; +}; + +struct bng_re_hwq { + struct pci_dev *pdev; + /* lock to protect hwq */ + spinlock_t lock; + struct bng_re_pbl pbl[BNG_PBL_LVL_MAX + 1]; + /* Valid values: 0, 1, 2 */ + enum bng_re_pbl_lvl level; + /* PBL entries */ + void **pbl_ptr; + /* PBL dma_addr */ + dma_addr_t *pbl_dma_ptr; + u32 max_elements; + u32 depth; + u16 element_size; + u32 prod; + u32 cons; + /* queue entry per page */ + u16 qe_ppg; +}; + +struct bng_re_stats { + dma_addr_t dma_map; + void *dma; + u32 size; + u32 fw_id; +}; + +struct bng_re_res { + struct pci_dev *pdev; + struct bng_re_chip_ctx *cctx; + struct bng_re_dev_attr *dattr; +}; + +static inline void *bng_re_get_qe(struct bng_re_hwq *hwq, + u32 indx, u64 *pg) +{ + u32 pg_num, pg_idx; + + pg_num = (indx / hwq->qe_ppg); + pg_idx = (indx % hwq->qe_ppg); + if (pg) + *pg = (u64)&hwq->pbl_ptr[pg_num]; + return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx); +} + +#define BNG_RE_INIT_DBHDR(xid, type, indx, toggle) \ + (((u64)(((xid) & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | \ + (type) | BNG_RE_DBR_VALID) << 32) | (indx) | \ + (((u32)(toggle)) << (BNG_RE_DBR_TOGGLE_SHIFT))) + +static inline void bng_re_ring_db(struct bng_re_db_info *info, + u32 type) +{ + u64 key = 0; + u32 indx; + u8 toggle = 0; + + if (type == DBC_DBC_TYPE_CQ_ARMALL || + type == DBC_DBC_TYPE_CQ_ARMSE) + toggle = info->toggle; + + 
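+	/*
+	 * The lower 32 bits of the doorbell carry the consumer index plus
+	 * the epoch and toggle bits; the upper 32 bits carry the queue id
+	 * (xid), the RoCE path, the doorbell type and the valid bit.
+	 */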
indx = (info->hwq->cons & DBC_DBC_INDEX_MASK) | + ((info->flags & BNG_RE_FLAG_EPOCH_CONS_MASK) << + BNG_RE_DB_EPOCH_CONS_SHIFT); + + key = BNG_RE_INIT_DBHDR(info->xid, type, indx, toggle); + writeq(key, info->db); +} + +static inline void bng_re_ring_nq_db(struct bng_re_db_info *info, + struct bng_re_chip_ctx *cctx, + bool arm) +{ + u32 type; + + type = arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ; + bng_re_ring_db(info, type); +} + +static inline void bng_re_hwq_incr_cons(u32 max_elements, u32 *cons, u32 cnt, + u32 *dbinfo_flags) +{ + /* move cons and update toggle/epoch if wrap around */ + *cons += cnt; + if (*cons >= max_elements) { + *cons %= max_elements; + *dbinfo_flags ^= 1UL << BNG_RE_FLAG_EPOCH_CONS_SHIFT; + } +} + +static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) +{ + return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); +} + +void bng_re_free_hwq(struct bng_re_res *res, + struct bng_re_hwq *hwq); + +int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq, + struct bng_re_hwq_attr *hwq_attr); + +void bng_re_free_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_stats *stats); + +int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_chip_ctx *cctx, + struct bng_re_stats *stats); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_sp.c b/drivers/infiniband/hw/bng_re/bng_sp.c new file mode 100644 index 000000000000..83099e05328d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_sp.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include <linux/interrupt.h> +#include <linux/pci.h> + +#include "bng_res.h" +#include "bng_fw.h" +#include "bng_sp.h" +#include "bng_tlv.h" + +static bool bng_re_is_atomic_cap(struct bng_re_rcfw *rcfw) +{ + u16 pcie_ctl2 = 0; + + pcie_capability_read_word(rcfw->pdev, PCI_EXP_DEVCTL2, &pcie_ctl2); + return (pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ); +} + +static void bng_re_query_version(struct bng_re_rcfw *rcfw, + char *fw_ver) +{ + struct creq_query_version_resp resp = {}; + struct bng_re_cmdqmsg msg = {}; + struct cmdq_query_version req = {}; + int rc; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_QUERY_VERSION, + sizeof(req)); + + bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + return; + fw_ver[0] = resp.fw_maj; + fw_ver[1] = resp.fw_minor; + fw_ver[2] = resp.fw_bld; + fw_ver[3] = resp.fw_rsvd; +} + +int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw) +{ + struct bng_re_dev_attr *attr = rcfw->res->dattr; + struct creq_query_func_resp resp = {}; + struct bng_re_cmdqmsg msg = {}; + struct creq_query_func_resp_sb *sb; + struct bng_re_rcfw_sbuf sbuf; + struct cmdq_query_func req = {}; + u8 *tqm_alloc; + int i, rc; + u32 temp; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_QUERY_FUNC, + sizeof(req)); + + sbuf.size = ALIGN(sizeof(*sb), BNG_FW_CMDQE_UNITS); + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + sb = sbuf.sb; + req.resp_size = sbuf.size / BNG_FW_CMDQE_UNITS; + bng_re_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + goto bail; + /* Extract the context from the side buffer */ + attr->max_qp = le32_to_cpu(sb->max_qp); + /* max_qp value reported by FW doesn't include the QP1 */ + attr->max_qp += 1; + attr->max_qp_rd_atom = + sb->max_qp_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ? 
+ BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom; + attr->max_qp_init_rd_atom = + sb->max_qp_init_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ? + BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; + + /* Adjust for max_qp_wqes for variable wqe */ + attr->max_qp_wqes = min_t(u32, attr->max_qp_wqes, BNG_VAR_MAX_WQE - 1); + + attr->max_qp_sges = min_t(u32, sb->max_sge_var_wqe, BNG_VAR_MAX_SGE); + attr->max_cq = le32_to_cpu(sb->max_cq); + attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); + attr->max_cq_sges = attr->max_qp_sges; + attr->max_mr = le32_to_cpu(sb->max_mr); + attr->max_mw = le32_to_cpu(sb->max_mw); + + attr->max_mr_size = le64_to_cpu(sb->max_mr_size); + attr->max_pd = 64 * 1024; + attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp); + attr->max_ah = le32_to_cpu(sb->max_ah); + + attr->max_srq = le16_to_cpu(sb->max_srq); + attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1; + attr->max_srq_sges = sb->max_srq_sge; + attr->max_pkey = 1; + attr->max_inline_data = le32_to_cpu(sb->max_inline_data); + /* + * Read the max gid supported by HW. + * For each entry in HW GID in HW table, we consume 2 + * GID entries in the kernel GID table. So max_gid reported + * to stack can be up to twice the value reported by the HW, up to 256 gids. + */ + attr->max_sgid = le32_to_cpu(sb->max_gid); + attr->max_sgid = min_t(u32, BNG_RE_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid); + attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags); + attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2); + + if (_is_max_srq_ext_supported(attr->dev_cap_flags2)) + attr->max_srq += le16_to_cpu(sb->max_srq_ext); + + bng_re_query_version(rcfw, attr->fw_ver); + for (i = 0; i < BNG_MAX_TQM_ALLOC_REQ / 4; i++) { + temp = le32_to_cpu(sb->tqm_alloc_reqs[i]); + tqm_alloc = (u8 *)&temp; + attr->tqm_alloc_reqs[i * 4] = *tqm_alloc; + attr->tqm_alloc_reqs[i * 4 + 1] = *(++tqm_alloc); + attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc); + attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc); + } + + attr->max_dpi = le32_to_cpu(sb->max_dpi); + attr->is_atomic = bng_re_is_atomic_cap(rcfw); +bail: + dma_free_coherent(&rcfw->pdev->dev, sbuf.size, + sbuf.sb, sbuf.dma_addr); + return rc; +} diff --git a/drivers/infiniband/hw/bng_re/bng_sp.h b/drivers/infiniband/hw/bng_re/bng_sp.h new file mode 100644 index 000000000000..e15190515ed1 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_sp.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. 
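+/*
+ * The device attributes below mirror the limits reported by the
+ * firmware QUERY_FUNC command, with a few software caps applied when
+ * bng_re_get_dev_attr() fills them in: for example, max_qp_rd_atom is
+ * clamped to BNG_RE_MAX_OUT_RD_ATOM (126) and max_sgid to at most
+ * BNG_RE_NUM_GIDS_SUPPORTED (256), i.e. twice the GID count reported
+ * by the hardware.
+ */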
+ +#ifndef __BNG_SP_H__ +#define __BNG_SP_H__ + +#include "bng_fw.h" + +#define BNG_VAR_MAX_WQE 4352 +#define BNG_VAR_MAX_SGE 13 + +struct bng_re_dev_attr { +#define FW_VER_ARR_LEN 4 + u8 fw_ver[FW_VER_ARR_LEN]; +#define BNG_RE_NUM_GIDS_SUPPORTED 256 + u16 max_sgid; + u16 max_mrw; + u32 max_qp; +#define BNG_RE_MAX_OUT_RD_ATOM 126 + u32 max_qp_rd_atom; + u32 max_qp_init_rd_atom; + u32 max_qp_wqes; + u32 max_qp_sges; + u32 max_cq; + u32 max_cq_wqes; + u32 max_cq_sges; + u32 max_mr; + u64 max_mr_size; + u32 max_pd; + u32 max_mw; + u32 max_raw_ethy_qp; + u32 max_ah; + u32 max_srq; + u32 max_srq_wqes; + u32 max_srq_sges; + u32 max_pkey; + u32 max_inline_data; + u32 l2_db_size; + u8 tqm_alloc_reqs[BNG_MAX_TQM_ALLOC_REQ]; + bool is_atomic; + u16 dev_cap_flags; + u16 dev_cap_flags2; + u32 max_dpi; +}; + +int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_tlv.h b/drivers/infiniband/hw/bng_re/bng_tlv.h new file mode 100644 index 000000000000..278f4922962d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_tlv.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +#ifndef __BNG_TLV_H__ +#define __BNG_TLV_H__ + +#include "roce_hsi.h" + +struct roce_tlv { + struct tlv tlv; + u8 total_size; // in units of 16 byte chunks + u8 unused[7]; // for 16 byte alignment +}; + +/* + * TLV size in units of 16 byte chunks + */ +#define TLV_SIZE ((sizeof(struct roce_tlv) + 15) / 16) +/* + * TLV length in bytes + */ +#define TLV_BYTES (TLV_SIZE * 16) + +#define HAS_TLV_HEADER(msg) (le16_to_cpu(((struct tlv *)(msg))->cmd_discr) == CMD_DISCR_TLV_ENCAP) +#define GET_TLV_DATA(tlv) ((void *)&((uint8_t *)(tlv))[TLV_BYTES]) + +static inline u8 __get_cmdq_base_opcode(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->opcode; + else + return req->opcode; +} + +static inline void __set_cmdq_base_opcode(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->opcode = val; + else + req->opcode = val; +} + +static inline __le16 __get_cmdq_base_cookie(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->cookie; + else + return req->cookie; +} + +static inline void __set_cmdq_base_cookie(struct cmdq_base *req, + u32 size, __le16 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->cookie = val; + else + req->cookie = val; +} + +static inline __le64 __get_cmdq_base_resp_addr(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr; + else + return req->resp_addr; +} + +static inline void __set_cmdq_base_resp_addr(struct cmdq_base *req, + u32 size, __le64 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr = val; + else + req->resp_addr = val; +} + +static inline u8 __get_cmdq_base_resp_size(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size; + else + return req->resp_size; +} + +static inline void __set_cmdq_base_resp_size(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size = val; + else + req->resp_size = val; +} + +static inline u8 
__get_cmdq_base_cmd_size(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct roce_tlv *)(req))->total_size; + else + return req->cmd_size; +} + +static inline void __set_cmdq_base_cmd_size(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->cmd_size = val; + else + req->cmd_size = val; +} + +static inline __le16 __get_cmdq_base_flags(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->flags; + else + return req->flags; +} + +static inline void __set_cmdq_base_flags(struct cmdq_base *req, + u32 size, __le16 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->flags = val; + else + req->flags = val; +} + +#endif /* __BNG_TLV_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 3485e495ac6a..3a7ce4729fcf 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -224,6 +224,8 @@ struct bnxt_re_dev { struct workqueue_struct *dcb_wq; struct dentry *cc_config; struct bnxt_re_dbg_cc_config_params *cc_config_params; + struct dentry *cq_coal_cfg; + struct bnxt_re_dbg_cq_coal_params *cq_coal_cfg_params; #define BNXT_VPD_FLD_LEN 32 char board_partno[BNXT_VPD_FLD_LEN]; /* RoCE mirror */ diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index be5e9b5ca2f0..88817c86ae24 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -23,6 +23,14 @@ static struct dentry *bnxt_re_debugfs_root; +static const char * const bnxt_re_cq_coal_str[] = { + "buf_maxtime", + "normal_maxbuf", + "during_maxbuf", + "en_ring_idle_mode", + "enable", +}; + static const char * const bnxt_re_cc_gen0_name[] = { "enable_cc", "run_avg_weight_g", @@ -349,6 +357,123 @@ static void bnxt_re_debugfs_add_info(struct bnxt_re_dev *rdev) debugfs_create_file("info", 0400, rdev->dbg_root, rdev, &info_fops); } +static ssize_t cq_coal_cfg_write(struct file *file, + const char __user *buf, + size_t count, loff_t *pos) +{ + struct seq_file *s = file->private_data; + struct bnxt_re_cq_coal_param *param = s->private; + struct bnxt_re_dev *rdev = param->rdev; + int offset = param->offset; + char lbuf[16] = { }; + u32 val; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &val)) + return -EINVAL; + + switch (offset) { + case BNXT_RE_COAL_CQ_BUF_MAXTIME: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME) + return -EINVAL; + rdev->cq_coalescing.buf_maxtime = val; + break; + case BNXT_RE_COAL_CQ_NORMAL_MAXBUF: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF) + return -EINVAL; + rdev->cq_coalescing.normal_maxbuf = val; + break; + case BNXT_RE_COAL_CQ_DURING_MAXBUF: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF) + return -EINVAL; + rdev->cq_coalescing.during_maxbuf = val; + break; + case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE: + if (val > BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE) + return -EINVAL; + rdev->cq_coalescing.en_ring_idle_mode = val; + break; + case BNXT_RE_COAL_CQ_ENABLE: + if (val > 1) + return -EINVAL; + rdev->cq_coalescing.enable = val; + break; + default: + return -EINVAL; + } + return count; +} + +static int cq_coal_cfg_show(struct seq_file *s, void 
*unused) +{ + struct bnxt_re_cq_coal_param *param = s->private; + struct bnxt_re_dev *rdev = param->rdev; + int offset = param->offset; + u32 val = 0; + + switch (offset) { + case BNXT_RE_COAL_CQ_BUF_MAXTIME: + val = rdev->cq_coalescing.buf_maxtime; + break; + case BNXT_RE_COAL_CQ_NORMAL_MAXBUF: + val = rdev->cq_coalescing.normal_maxbuf; + break; + case BNXT_RE_COAL_CQ_DURING_MAXBUF: + val = rdev->cq_coalescing.during_maxbuf; + break; + case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE: + val = rdev->cq_coalescing.en_ring_idle_mode; + break; + case BNXT_RE_COAL_CQ_ENABLE: + val = rdev->cq_coalescing.enable; + break; + default: + return -EINVAL; + } + + seq_printf(s, "%u\n", val); + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(cq_coal_cfg); + +static void bnxt_re_cleanup_cq_coal_debugfs(struct bnxt_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->cq_coal_cfg); + kfree(rdev->cq_coal_cfg_params); +} + +static void bnxt_re_init_cq_coal_debugfs(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_dbg_cq_coal_params *dbg_cq_coal_params; + int i; + + if (!_is_cq_coalescing_supported(rdev->dev_attr->dev_cap_flags2)) + return; + + dbg_cq_coal_params = kzalloc(sizeof(*dbg_cq_coal_params), GFP_KERNEL); + if (!dbg_cq_coal_params) + return; + + rdev->cq_coal_cfg = debugfs_create_dir("cq_coal_cfg", rdev->dbg_root); + rdev->cq_coal_cfg_params = dbg_cq_coal_params; + + for (i = 0; i < BNXT_RE_COAL_CQ_MAX; i++) { + dbg_cq_coal_params->params[i].offset = i; + dbg_cq_coal_params->params[i].rdev = rdev; + debugfs_create_file(bnxt_re_cq_coal_str[i], + 0600, rdev->cq_coal_cfg, + &dbg_cq_coal_params->params[i], + &cq_coal_cfg_fops); + } +} + void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) { struct pci_dev *pdev = rdev->en_dev->pdev; @@ -374,10 +499,13 @@ void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) rdev->cc_config, tmp_params, &bnxt_re_cc_config_ops); } + + bnxt_re_init_cq_coal_debugfs(rdev); } void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev) { + bnxt_re_cleanup_cq_coal_debugfs(rdev); debugfs_remove_recursive(rdev->qp_debugfs); debugfs_remove_recursive(rdev->cc_config); kfree(rdev->cc_config_params); diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h index 8f101df4e838..98f4620ef245 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.h +++ b/drivers/infiniband/hw/bnxt_re/debugfs.h @@ -33,4 +33,23 @@ struct bnxt_re_cc_param { struct bnxt_re_dbg_cc_config_params { struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0]; }; + +struct bnxt_re_cq_coal_param { + struct bnxt_re_dev *rdev; + u32 offset; +}; + +enum bnxt_re_cq_coal_types { + BNXT_RE_COAL_CQ_BUF_MAXTIME, + BNXT_RE_COAL_CQ_NORMAL_MAXBUF, + BNXT_RE_COAL_CQ_DURING_MAXBUF, + BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE, + BNXT_RE_COAL_CQ_ENABLE, + BNXT_RE_COAL_CQ_MAX + +}; + +struct bnxt_re_dbg_cq_coal_params { + struct bnxt_re_cq_coal_param params[BNXT_RE_COAL_CQ_MAX]; +}; #endif diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 84ce3fce2826..f19b55c13d58 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -601,7 +601,8 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->qplib_mr.va = (u64)(unsigned long)fence->va; mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, - BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE); + BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE, + _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)); if (rc) { ibdev_err(&rdev->ibdev, "Failed to register 
fence-MR\n"); goto fail; @@ -4027,7 +4028,7 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->qplib_mr.hwq.level = PBL_LVL_MAX; mr->qplib_mr.total_size = -1; /* Infinte length */ rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, 0, - PAGE_SIZE); + PAGE_SIZE, false); if (rc) goto fail_mr; @@ -4257,7 +4258,8 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 umem_pgs = ib_umem_num_dma_blocks(umem, page_size); rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem, - umem_pgs, page_size); + umem_pgs, page_size, + _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)); if (rc) { ibdev_err(&rdev->ibdev, "Failed to register user MR - rc = %d\n", rc); rc = -EIO; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b13810572c2e..73003ad25ee8 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1453,6 +1453,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev, atomic_set(&rdev->stats.res.pd_count, 0); rdev->cosq[0] = 0xFFFF; rdev->cosq[1] = 0xFFFF; + rdev->cq_coalescing.enable = 1; rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME; if (bnxt_re_chip_gen_p7(en_dev->chip_num)) { rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index ce90d3d834d4..c88f049136fc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2226,7 +2226,8 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->max_wqe); - if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) { + if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2) && + cq->coalescing->enable) { req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID); coalescing |= ((cq->coalescing->buf_maxtime << CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) & diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index b990d0c0ce1a..1b414a73b46d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -395,6 +395,7 @@ struct bnxt_qplib_cq_coal_param { u8 normal_maxbuf; u8 during_maxbuf; u8 en_ring_idle_mode; + u8 enable; }; #define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1 diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 9ef581ed785c..408a34df2667 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -162,7 +162,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1; attr->max_srq_sges = sb->max_srq_sge; attr->max_pkey = 1; - attr->max_inline_data = le32_to_cpu(sb->max_inline_data); + attr->max_inline_data = attr->max_qp_sges * sizeof(struct sq_sge); if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx)) attr->l2_db_size = (sb->l2_db_space_size + 1) * (0x01 << RCFW_DBR_BASE_PAGE_SHIFT); @@ -578,7 +578,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, } int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - struct ib_umem *umem, int num_pbls, u32 buf_pg_size) + struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct 
bnxt_qplib_hwq_attr hwq_attr = {}; @@ -640,7 +640,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, req.access = (mr->access_flags & BNXT_QPLIB_MR_ACCESS_MASK); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); - if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) + if (unified_mr) req.key = cpu_to_le32(mr->pd->id); req.flags = cpu_to_le16(mr->flags); req.mr_size = cpu_to_le64(mr->total_size); @@ -651,7 +651,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, if (rc) goto fail; - if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) { + if (unified_mr) { mr->lkey = le32_to_cpu(resp.xid); mr->rkey = mr->lkey; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 147b5d9c0313..5a45c55c6464 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -341,7 +341,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, bool block); int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - struct ib_umem *umem, int num_pbls, u32 buf_pg_size); + struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr); int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr); int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, int max); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index dcdfe250bdbe..adeed7447e7b 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -348,7 +348,7 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl, { int err; - pr_debug("*pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + pr_debug("*pbl_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", pbl_addr, rdev->lldi.vr->pbl.start, pbl_size); diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index b35f92e7d865..e4aef102dac0 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -745,8 +745,8 @@ static int create_workqueues(struct hfi1_devdata *dd) ppd->hfi1_wq = alloc_workqueue( "hfi%d_%d", - WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | - WQ_MEM_RECLAIM, + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | + WQ_PERCPU, HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, dd->unit, pidx); if (!ppd->hfi1_wq) diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c index 370a5a8eaa71..6e0e3458d202 100644 --- a/drivers/infiniband/hw/hfi1/opfn.c +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -305,8 +305,8 @@ void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1) int opfn_init(void) { opfn_wq = alloc_workqueue("hfi_opfn", - WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | - WQ_MEM_RECLAIM, + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | + WQ_PERCPU, HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES); if (!opfn_wq) return -ENOMEM; diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index baf592e6f21b..d07ef02c5231 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -4,11 +4,13 @@ # ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 +ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3pf +ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3_common ccflags-y += -I $(src) hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o 
hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_debugfs.o hns_roce_hw_v2.o + hns_roce_debugfs.o hns_roce_hw_v2.o hns_roce_bond.o obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 307c35888b30..0c1c32d23c88 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include "hns_roce_device.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c new file mode 100644 index 000000000000..cc85f3ce1f3e --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 Hisilicon Limited. + */ + +#include <net/lag.h> +#include <net/bonding.h> +#include "hns_roce_device.h" +#include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" + +static DEFINE_XARRAY(roce_bond_xa); + +static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev) +{ + struct ib_device *ibdev = + ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS); + + if (!ibdev) + return NULL; + + return container_of(ibdev, struct hns_roce_dev, ib_dev); +} + +static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev) +{ + struct net_device *upper_dev; + + rcu_read_lock(); + upper_dev = netdev_master_upper_dev_get_rcu(net_dev); + dev_hold(upper_dev); + rcu_read_unlock(); + + return upper_dev; +} + +static int get_netdev_bond_slave_id(struct net_device *net_dev, + struct hns_roce_bond_group *bond_grp) +{ + int i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) + if (net_dev == bond_grp->bond_func_info[i].net_dev) + return i; + + return -ENOENT; +} + +struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, + u8 bus_num) +{ + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + struct hns_roce_bond_group *bond_grp; + struct net_device *upper_dev = NULL; + int i; + + if (!die_info) + return NULL; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + return bond_grp; + if (bond_grp->upper_dev) { + upper_dev = get_upper_dev_from_ndev(net_dev); + if (bond_grp->upper_dev == upper_dev) { + dev_put(upper_dev); + return bond_grp; + } + dev_put(upper_dev); + } + } + + return NULL; +} + +static int hns_roce_set_bond_netdev(struct hns_roce_bond_group *bond_grp, + struct hns_roce_dev *hr_dev) +{ + struct net_device *active_dev; + struct net_device *old_dev; + int i, ret = 0; + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + rcu_read_lock(); + active_dev = + bond_option_active_slave_get_rcu(netdev_priv(bond_grp->upper_dev)); + rcu_read_unlock(); + } else { + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + active_dev = bond_grp->bond_func_info[i].net_dev; + if (active_dev && + ib_get_curr_port_state(active_dev) == IB_PORT_ACTIVE) + break; + } + } + + if (!active_dev || i == ROCE_BOND_FUNC_MAX) + active_dev = get_hr_netdev(hr_dev, 0); + + old_dev = ib_device_get_netdev(&hr_dev->ib_dev, 1); + if (old_dev == active_dev) + goto out; + + ret = ib_device_set_netdev(&hr_dev->ib_dev, active_dev, 1); + if (ret) { + dev_err(hr_dev->dev, "failed to set netdev for bond.\n"); + goto out; + } + + if 
(bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + if (old_dev) + roce_del_all_netdev_gids(&hr_dev->ib_dev, 1, old_dev); + rdma_roce_rescan_port(&hr_dev->ib_dev, 1); + } +out: + dev_put(old_dev); + return ret; +} + +bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev) +{ + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED && + bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED) + return true; + + return false; +} + +static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + u32 active_slave_map = 0; + u8 active_slave_num = 0; + bool active; + u8 i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (!net_dev || !(bond_grp->slave_map & (1U << i))) + continue; + + active = (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ? + net_lag_port_dev_txable(net_dev) : + (ib_get_curr_port_state(net_dev) == IB_PORT_ACTIVE); + if (active) { + active_slave_num++; + active_slave_map |= (1U << i); + } + } + + bond_grp->active_slave_num = active_slave_num; + bond_grp->active_slave_map = active_slave_map; +} + +static int hns_roce_recover_bond(struct hns_roce_bond_group *bond_grp, + struct hns_roce_dev *hr_dev) +{ + bond_grp->main_hr_dev = hr_dev; + hns_roce_bond_get_active_slave(bond_grp); + + return hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND); +} + +static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp, + u8 func_idx) +{ + struct hnae3_handle *handle; + + handle = bond_grp->bond_func_info[func_idx].handle; + if (handle->priv) + hns_roce_bond_uninit_client(bond_grp, func_idx); +} + +static struct hns_roce_dev + *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp, + u8 func_idx, bool need_switch); + +static int switch_main_dev(struct hns_roce_bond_group *bond_grp, + u8 main_func_idx) +{ + struct hns_roce_dev *hr_dev; + struct net_device *net_dev; + u8 i; + + bond_grp->main_hr_dev = NULL; + hns_roce_bond_uninit_client(bond_grp, main_func_idx); + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if ((bond_grp->slave_map & (1U << i)) && net_dev) { + /* In case this slave is still being registered as + * a non-bonded PF, uninit it first and then re-init + * it as the main device. 
+ */ + hns_roce_slave_uninit(bond_grp, i); + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + + if (!bond_grp->main_hr_dev) + return -ENODEV; + + return 0; +} + +static struct hns_roce_dev + *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp, + u8 func_idx, bool need_switch) +{ + struct hns_roce_dev *hr_dev = NULL; + struct hnae3_handle *handle; + u8 main_func_idx; + int ret; + + if (need_switch) { + main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); + if (func_idx == main_func_idx) { + ret = switch_main_dev(bond_grp, main_func_idx); + if (ret == -ENODEV) + return NULL; + } + } + + handle = bond_grp->bond_func_info[func_idx].handle; + if (handle) { + if (handle->priv) + return handle->priv; + /* Prevent this device from being initialized as a bond device */ + if (need_switch) + bond_grp->bond_func_info[func_idx].net_dev = NULL; + hr_dev = hns_roce_bond_init_client(bond_grp, func_idx); + if (!hr_dev) + BOND_ERR_LOG("failed to init slave %u.\n", func_idx); + } + + return hr_dev; +} + +static struct hns_roce_die_info *alloc_die_info(int bus_num) +{ + struct hns_roce_die_info *die_info; + int ret; + + die_info = kzalloc(sizeof(*die_info), GFP_KERNEL); + if (!die_info) + return NULL; + + ret = xa_err(xa_store(&roce_bond_xa, bus_num, die_info, GFP_KERNEL)); + if (ret) { + kfree(die_info); + return NULL; + } + + mutex_init(&die_info->die_mutex); + + return die_info; +} + +static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num) +{ + mutex_destroy(&die_info->die_mutex); + xa_erase(&roce_bond_xa, bus_num); + kfree(die_info); +} + +static int alloc_bond_id(struct hns_roce_bond_group *bond_grp) +{ + u8 bus_num = bond_grp->bus_num; + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + int i; + + if (!die_info) { + die_info = alloc_die_info(bus_num); + if (!die_info) + return -ENOMEM; + } + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + if (die_info->bond_id_mask & BOND_ID(i)) + continue; + + die_info->bond_id_mask |= BOND_ID(i); + die_info->bgrps[i] = bond_grp; + bond_grp->bond_id = i; + + return 0; + } + + return -ENOSPC; +} + +static int remove_bond_id(int bus_num, u8 bond_id) +{ + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + + if (bond_id >= ROCE_BOND_NUM_MAX) + return -EINVAL; + + if (!die_info) + return -ENODEV; + + die_info->bond_id_mask &= ~BOND_ID(bond_id); + die_info->bgrps[bond_id] = NULL; + if (!die_info->bond_id_mask) + dealloc_die_info(die_info, bus_num); + + return 0; +} + +static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp) +{ + struct hns_roce_dev *hr_dev; + int ret; + int i; + + for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) { + if (bond_grp->slave_map & (1 << i)) + hns_roce_slave_uninit(bond_grp, i); + } + + mutex_lock(&bond_grp->bond_mutex); + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + bond_grp->main_hr_dev = NULL; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + if (bond_grp->slave_map & (1 << i)) { + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + + if (!bond_grp->main_hr_dev) { + ret = -ENODEV; + goto out; + } + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND); + +out: + if (ret) { + BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret); + hns_roce_cleanup_bond(bond_grp); + } else { + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + 
"RoCE set bond finished!\n"); + } +} + +static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp) +{ + u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); + struct hns_roce_dev *hr_dev; + u8 i; + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED) + goto out; + + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + bond_grp->main_hr_dev = NULL; + + hns_roce_slave_uninit(bond_grp, main_func_idx); + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) + bond_grp->main_hr_dev = hr_dev; + } + +out: + hns_roce_cleanup_bond(bond_grp); +} + +static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp) +{ + int ret; + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + + mutex_lock(&bond_grp->bond_mutex); + if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGESTATE) + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to change RoCE bond slave state, ret = %d.\n", + ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave changestate finished!\n"); +} + +static void hns_roce_slave_change_num(struct hns_roce_bond_group *bond_grp) +{ + int ret; + u8 i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + if (bond_grp->slave_map & (1U << i)) { + if (i == PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn)) + continue; + hns_roce_slave_uninit(bond_grp, i); + } else { + hns_roce_slave_init(bond_grp, i, true); + if (!bond_grp->main_hr_dev) { + ret = -ENODEV; + goto out; + } + bond_grp->bond_func_info[i].net_dev = NULL; + bond_grp->bond_func_info[i].handle = NULL; + } + } + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + +out: + if (ret) { + BOND_ERR_LOG("failed to change RoCE bond slave num, ret = %d.\n", ret); + hns_roce_cleanup_bond(bond_grp); + } else { + mutex_lock(&bond_grp->bond_mutex); + if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGE_NUM) + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave change num finished!\n"); + } +} + +static void hns_roce_bond_info_update_nolock(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev) +{ + struct hns_roce_v2_priv *priv; + struct hns_roce_dev *hr_dev; + struct net_device *net_dev; + int func_idx; + + bond_grp->slave_map = 0; + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper_dev, net_dev) { + func_idx = get_netdev_bond_slave_id(net_dev, bond_grp); + if (func_idx < 0) { + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) + continue; + func_idx = PCI_FUNC(hr_dev->pci_dev->devfn); + if (!bond_grp->bond_func_info[func_idx].net_dev) { + priv = hr_dev->priv; + bond_grp->bond_func_info[func_idx].net_dev = + net_dev; + bond_grp->bond_func_info[func_idx].handle = + priv->handle; + } + ib_device_put(&hr_dev->ib_dev); + } + + bond_grp->slave_map |= (1 << func_idx); + } + rcu_read_unlock(); +} + +static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp, + struct net_device *net_dev) +{ + struct hns_roce_dev *hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + bool ret = true; + + if (!hr_dev) { + if (bond_grp && + get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + return true; + else + return false; + } + + if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) { + ret = false; + goto out; + } + + if 
(hr_dev->is_vf || pci_num_vf(hr_dev->pci_dev) > 0) { + ret = false; + goto out; + } + + if (bond_grp->bus_num != get_hr_bus_num(hr_dev)) + ret = false; + +out: + ib_device_put(&hr_dev->ib_dev); + return ret; +} + +static bool check_slave_support(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev) +{ + struct net_device *net_dev; + u8 slave_num = 0; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper_dev, net_dev) { + if (is_dev_bond_supported(bond_grp, net_dev)) { + slave_num++; + continue; + } + rcu_read_unlock(); + return false; + } + rcu_read_unlock(); + + return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX); +} + +static void hns_roce_bond_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct hns_roce_bond_group *bond_grp = + container_of(delayed_work, struct hns_roce_bond_group, + bond_work); + enum hns_roce_bond_state bond_state; + bool bond_ready; + + mutex_lock(&bond_grp->bond_mutex); + bond_ready = check_slave_support(bond_grp, bond_grp->upper_dev); + hns_roce_bond_info_update_nolock(bond_grp, bond_grp->upper_dev); + bond_state = bond_grp->bond_state; + bond_grp->bond_ready = bond_ready; + mutex_unlock(&bond_grp->bond_mutex); + + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "bond work: bond_ready - %d, bond_state - %d.\n", + bond_ready, bond_state); + + if (!bond_ready) { + hns_roce_clear_bond(bond_grp); + return; + } + + switch (bond_state) { + case HNS_ROCE_BOND_NOT_BONDED: + hns_roce_set_bond(bond_grp); + /* In set_bond flow, we don't need to set bond netdev here as + * it has been done when bond_grp->main_hr_dev is registered. + */ + return; + case HNS_ROCE_BOND_SLAVE_CHANGESTATE: + hns_roce_slave_changestate(bond_grp); + break; + case HNS_ROCE_BOND_SLAVE_CHANGE_NUM: + hns_roce_slave_change_num(bond_grp); + break; + default: + return; + } + hns_roce_set_bond_netdev(bond_grp, bond_grp->main_hr_dev); +} + +static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp, + struct hns_roce_dev *hr_dev, + struct net_device *upper_dev) +{ + bond_grp->upper_dev = upper_dev; + bond_grp->main_hr_dev = hr_dev; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + bond_grp->bond_ready = false; +} + +static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp) +{ + mutex_lock(&bond_grp->bond_mutex); + + cancel_delayed_work(&bond_grp->bond_work); + bond_grp->upper_dev = NULL; + bond_grp->main_hr_dev = NULL; + bond_grp->bond_ready = false; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED; + bond_grp->slave_map = 0; + memset(bond_grp->bond_func_info, 0, sizeof(bond_grp->bond_func_info)); + + mutex_unlock(&bond_grp->bond_mutex); +} + +void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp) +{ + int ret; + + ret = bond_grp->main_hr_dev ? 
+ hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO; + if (ret) + BOND_ERR_LOG("failed to clear RoCE bond, ret = %d.\n", ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE clear bond finished!\n"); + + hns_roce_detach_bond_grp(bond_grp); +} + +static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp, + struct net_device *net_dev) +{ + struct hns_roce_bond_group *bond_grp_tmp; + + bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bond_grp->bus_num); + return bond_grp_tmp == bond_grp; +} + +static void lowerstate_event_setting(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changelowerstate_info *info) +{ + mutex_lock(&bond_grp->bond_mutex); + + if (bond_grp->bond_ready && + bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED) + bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGESTATE; + + mutex_unlock(&bond_grp->bond_mutex); +} + +static bool hns_roce_bond_lowerstate_event(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changelowerstate_info *info) +{ + struct net_device *net_dev = + netdev_notifier_info_to_dev((struct netdev_notifier_info *)info); + + if (!netif_is_lag_port(net_dev)) + return false; + + if (!lowerstate_event_filter(bond_grp, net_dev)) + return false; + + lowerstate_event_setting(bond_grp, info); + + return true; +} + +static bool is_bond_setting_supported(struct netdev_lag_upper_info *bond_info) +{ + if (!bond_info) + return false; + + if (bond_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + bond_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) + return false; + + if (bond_info->tx_type == NETDEV_LAG_TX_TYPE_HASH && + bond_info->hash_type > NETDEV_LAG_HASH_L23) + return false; + + return true; +} + +static void upper_event_setting(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changeupper_info *info) +{ + struct netdev_lag_upper_info *bond_upper_info = NULL; + bool slave_inc = info->linking; + + if (slave_inc) + bond_upper_info = info->upper_info; + + if (bond_upper_info) { + bond_grp->tx_type = bond_upper_info->tx_type; + bond_grp->hash_type = bond_upper_info->hash_type; + } +} + +static bool check_unlinking_bond_support(struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + u8 slave_num = 0; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(bond_grp->upper_dev, net_dev) { + if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + slave_num++; + } + rcu_read_unlock(); + + return (slave_num > 1); +} + +static bool check_linking_bond_support(struct netdev_lag_upper_info *bond_info, + struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev) +{ + if (!is_bond_setting_supported(bond_info)) + return false; + + return check_slave_support(bond_grp, upper_dev); +} + +static enum bond_support_type + check_bond_support(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev, + struct netdev_notifier_changeupper_info *info) +{ + bool bond_grp_exist = false; + bool support; + + if (upper_dev == bond_grp->upper_dev) + bond_grp_exist = true; + + if (!info->linking && !bond_grp_exist) + return BOND_NOT_SUPPORT; + + if (info->linking) + support = check_linking_bond_support(info->upper_info, bond_grp, + upper_dev); + else + support = check_unlinking_bond_support(bond_grp); + + if (support) + return BOND_SUPPORT; + + return bond_grp_exist ? 
BOND_EXISTING_NOT_SUPPORT : BOND_NOT_SUPPORT; +} + +static bool upper_event_filter(struct netdev_notifier_changeupper_info *info, + struct hns_roce_bond_group *bond_grp, + struct net_device *net_dev) +{ + struct net_device *upper_dev = info->upper_dev; + struct hns_roce_bond_group *bond_grp_tmp; + struct hns_roce_dev *hr_dev; + bool ret = true; + u8 bus_num; + + if (!info->linking || + bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED) + return bond_grp->upper_dev == upper_dev; + + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) + return false; + + bus_num = get_hr_bus_num(hr_dev); + if (bond_grp->bus_num != bus_num) { + ret = false; + goto out; + } + + bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp_tmp && bond_grp_tmp != bond_grp) + ret = false; +out: + ib_device_put(&hr_dev->ib_dev); + return ret; +} + +static bool hns_roce_bond_upper_event(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *net_dev = + netdev_notifier_info_to_dev((struct netdev_notifier_info *)info); + struct net_device *upper_dev = info->upper_dev; + enum bond_support_type support = BOND_SUPPORT; + struct hns_roce_dev *hr_dev; + int slave_id; + + if (!upper_dev || !netif_is_lag_master(upper_dev)) + return false; + + if (!upper_event_filter(info, bond_grp, net_dev)) + return false; + + mutex_lock(&bond_grp->bond_mutex); + support = check_bond_support(bond_grp, upper_dev, info); + if (support == BOND_NOT_SUPPORT) { + mutex_unlock(&bond_grp->bond_mutex); + return false; + } + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_ATTACHED) { + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) { + mutex_unlock(&bond_grp->bond_mutex); + return false; + } + hns_roce_attach_bond_grp(bond_grp, hr_dev, upper_dev); + ib_device_put(&hr_dev->ib_dev); + } + + /* In the case of netdev being unregistered, the roce + * instance shouldn't be inited. 
+ */ + if (net_dev->reg_state >= NETREG_UNREGISTERING) { + slave_id = get_netdev_bond_slave_id(net_dev, bond_grp); + if (slave_id >= 0) { + bond_grp->bond_func_info[slave_id].net_dev = NULL; + bond_grp->bond_func_info[slave_id].handle = NULL; + } + } + + if (support == BOND_SUPPORT) { + bond_grp->bond_ready = true; + if (bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED) + bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGE_NUM; + } + mutex_unlock(&bond_grp->bond_mutex); + if (support == BOND_SUPPORT) + upper_event_setting(bond_grp, info); + + return true; +} + +static int hns_roce_bond_event(struct notifier_block *self, + unsigned long event, void *ptr) +{ + struct hns_roce_bond_group *bond_grp = + container_of(self, struct hns_roce_bond_group, bond_nb); + bool changed = false; + + if (event == NETDEV_CHANGEUPPER) + changed = hns_roce_bond_upper_event(bond_grp, ptr); + if (event == NETDEV_CHANGELOWERSTATE) + changed = hns_roce_bond_lowerstate_event(bond_grp, ptr); + + if (changed) + schedule_delayed_work(&bond_grp->bond_work, HZ); + + return NOTIFY_DONE; +} + +int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX]; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + int ret; + int i; + + if (xa_load(&roce_bond_xa, bus_num)) + return 0; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = kvzalloc(sizeof(*bond_grp), GFP_KERNEL); + if (!bond_grp) { + ret = -ENOMEM; + goto mem_err; + } + + mutex_init(&bond_grp->bond_mutex); + INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_bond_work); + + bond_grp->bond_ready = false; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED; + bond_grp->bus_num = bus_num; + + ret = alloc_bond_id(bond_grp); + if (ret) { + dev_err(hr_dev->dev, + "failed to alloc bond ID, ret = %d.\n", ret); + goto alloc_id_err; + } + + bond_grp->bond_nb.notifier_call = hns_roce_bond_event; + ret = register_netdevice_notifier(&bond_grp->bond_nb); + if (ret) { + ibdev_err(&hr_dev->ib_dev, + "failed to register bond nb, ret = %d.\n", ret); + goto register_nb_err; + } + bgrps[i] = bond_grp; + } + + return 0; + +register_nb_err: + remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); +alloc_id_err: + mutex_destroy(&bond_grp->bond_mutex); + kvfree(bond_grp); +mem_err: + for (i--; i >= 0; i--) { + unregister_netdevice_notifier(&bgrps[i]->bond_nb); + cancel_delayed_work_sync(&bgrps[i]->bond_work); + remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id); + mutex_destroy(&bgrps[i]->bond_mutex); + kvfree(bgrps[i]); + } + return ret; +} + +void hns_roce_dealloc_bond_grp(void) +{ + struct hns_roce_bond_group *bond_grp; + struct hns_roce_die_info *die_info; + unsigned long id; + int i; + + xa_for_each(&roce_bond_xa, id, die_info) { + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + unregister_netdevice_notifier(&bond_grp->bond_nb); + cancel_delayed_work_sync(&bond_grp->bond_work); + remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); + mutex_destroy(&bond_grp->bond_mutex); + kvfree(bond_grp); + } + } +} + +int hns_roce_bond_init(struct hns_roce_dev *hr_dev) +{ + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + int ret; + + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + + if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT) { + ret = hns_roce_recover_bond(bond_grp, hr_dev); + if (ret) { + 
dev_err(hr_dev->dev,
+				"failed to recover RoCE bond, ret = %d.\n", ret);
+			return ret;
+		}
+	}
+
+	return hns_roce_set_bond_netdev(bond_grp, hr_dev);
+}
+
+void hns_roce_bond_suspend(struct hnae3_handle *handle)
+{
+	u8 bus_num = handle->pdev->bus->number;
+	struct hns_roce_bond_group *bond_grp;
+	struct hns_roce_die_info *die_info;
+	int i;
+
+	die_info = xa_load(&roce_bond_xa, bus_num);
+	if (!die_info)
+		return;
+
+	mutex_lock(&die_info->die_mutex);
+
+	/*
+	 * Avoid duplicated processing when calling this function
+	 * multiple times.
+	 */
+	if (die_info->suspend_cnt)
+		goto out;
+
+	for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+		bond_grp = die_info->bgrps[i];
+		if (!bond_grp)
+			continue;
+		unregister_netdevice_notifier(&bond_grp->bond_nb);
+		cancel_delayed_work_sync(&bond_grp->bond_work);
+	}
+
+out:
+	die_info->suspend_cnt++;
+	mutex_unlock(&die_info->die_mutex);
+}
+
+void hns_roce_bond_resume(struct hnae3_handle *handle)
+{
+	u8 bus_num = handle->pdev->bus->number;
+	struct hns_roce_bond_group *bond_grp;
+	struct hns_roce_die_info *die_info;
+	int i, ret;
+
+	die_info = xa_load(&roce_bond_xa, bus_num);
+	if (!die_info)
+		return;
+
+	mutex_lock(&die_info->die_mutex);
+
+	die_info->suspend_cnt--;
+	if (die_info->suspend_cnt)
+		goto out;
+
+	for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+		bond_grp = die_info->bgrps[i];
+		if (!bond_grp)
+			continue;
+		ret = register_netdevice_notifier(&bond_grp->bond_nb);
+		if (ret)
+			dev_err(&handle->pdev->dev,
+				"failed to resume bond notifier(bus_num = %u, id = %u), ret = %d.\n",
+				bus_num, bond_grp->bond_id, ret);
+	}
+
+out:
+	mutex_unlock(&die_info->die_mutex);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
new file mode 100644
index 000000000000..98c295d78ca1
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#ifndef _HNS_ROCE_BOND_H
+#define _HNS_ROCE_BOND_H
+
+#include <linux/netdevice.h>
+#include <net/bonding.h>
+
+#define ROCE_BOND_FUNC_MAX 4
+#define ROCE_BOND_NUM_MAX 2
+
+#define BOND_ID(id) BIT(id)
+
+#define BOND_ERR_LOG(fmt, ...) \
+	pr_err("HNS RoCE Bonding: " fmt, ##__VA_ARGS__)
+
+enum {
+	BOND_MODE_1,
+	BOND_MODE_2_4,
+};
+
+enum hns_roce_bond_hashtype {
+	BOND_HASH_L2,
+	BOND_HASH_L34,
+	BOND_HASH_L23,
+};
+
+enum bond_support_type {
+	BOND_NOT_SUPPORT,
+	/*
+	 * bond_grp already exists, but in the current
+	 * conditions it's no longer supported
+	 */
+	BOND_EXISTING_NOT_SUPPORT,
+	BOND_SUPPORT,
+};
+
+enum hns_roce_bond_state {
+	HNS_ROCE_BOND_NOT_ATTACHED,
+	HNS_ROCE_BOND_NOT_BONDED,
+	HNS_ROCE_BOND_IS_BONDED,
+	HNS_ROCE_BOND_SLAVE_CHANGE_NUM,
+	HNS_ROCE_BOND_SLAVE_CHANGESTATE,
+};
+
+enum hns_roce_bond_cmd_type {
+	HNS_ROCE_SET_BOND,
+	HNS_ROCE_CHANGE_BOND,
+	HNS_ROCE_CLEAR_BOND,
+};
+
+struct hns_roce_func_info {
+	struct net_device *net_dev;
+	struct hnae3_handle *handle;
+};
+
+struct hns_roce_bond_group {
+	struct net_device *upper_dev;
+	struct hns_roce_dev *main_hr_dev;
+	u8 active_slave_num;
+	u32 slave_map;
+	u32 active_slave_map;
+	u8 bond_id;
+	u8 bus_num;
+	struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX];
+	bool bond_ready;
+	enum hns_roce_bond_state bond_state;
+	enum netdev_lag_tx_type tx_type;
+	enum netdev_lag_hash hash_type;
+	struct mutex bond_mutex;
+	struct notifier_block bond_nb;
+	struct delayed_work bond_work;
+};
+
+struct hns_roce_die_info {
+	u8 bond_id_mask;
+	struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+	struct mutex die_mutex;
+	u8 suspend_cnt;
+};
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+						  u8 bus_num);
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev);
+void hns_roce_dealloc_bond_grp(void);
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp);
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev);
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev);
+void hns_roce_bond_suspend(struct hnae3_handle *handle);
+void hns_roce_bond_resume(struct hnae3_handle *handle);
+
+#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 06832c0ac055..318f18cf37aa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -33,6 +33,7 @@
 #ifndef _HNS_ROCE_DEVICE_H
 #define _HNS_ROCE_DEVICE_H
+#include <linux/pci.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/hns-abi.h>
 #include "hns_roce_debugfs.h"
@@ -153,6 +154,7 @@ enum {
 	HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14),
 	HNS_ROCE_CAP_FLAG_STASH = BIT(17),
 	HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19),
+	HNS_ROCE_CAP_FLAG_BOND = BIT(21),
 	HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22),
 };
@@ -177,6 +179,7 @@ enum hns_roce_instance_state {
 	HNS_ROCE_STATE_INIT,
 	HNS_ROCE_STATE_INITED,
 	HNS_ROCE_STATE_UNINIT,
+	HNS_ROCE_STATE_BOND_UNINIT,
 };
 enum {
@@ -1167,6 +1170,17 @@ static inline u8 get_tclass(const struct ib_global_route *grh)
 		grh->traffic_class >> DSCP_SHIFT : grh->traffic_class;
 }
+static inline struct net_device *get_hr_netdev(struct hns_roce_dev *hr_dev,
+					       u8 port)
+{
+	return hr_dev->iboe.netdevs[port];
+}
+
+static inline u8 get_hr_bus_num(struct hns_roce_dev *hr_dev)
+{
+	return hr_dev->pci_dev->bus->number;
+}
+
 void hns_roce_init_uar_table(struct hns_roce_dev *dev);
 int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar);
@@ -1293,7 +1307,7 @@ void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn);
 void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
 void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
 int hns_roce_init(struct hns_roce_dev *hr_dev);
-void hns_roce_exit(struct hns_roce_dev *hr_dev);
+void
hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup); int hns_roce_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq); int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq); int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 63052c0e7613..2d6ae89e525b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -43,11 +43,13 @@ #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> +#include "hclge_main.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" #define CREATE_TRACE_POINTS #include "hns_roce_trace.h" @@ -1434,6 +1436,79 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, return ret; } +static enum hns_roce_opcode_type + get_bond_opcode(enum hns_roce_bond_cmd_type bond_type) +{ + switch (bond_type) { + case HNS_ROCE_SET_BOND: + return HNS_ROCE_OPC_SET_BOND_INFO; + case HNS_ROCE_CHANGE_BOND: + return HNS_ROCE_OPC_CHANGE_ACTIVE_PORT; + case HNS_ROCE_CLEAR_BOND: + return HNS_ROCE_OPC_CLEAR_BOND_INFO; + default: + WARN(true, "Invalid bond type %d!\n", bond_type); + return HNS_ROCE_OPC_SET_BOND_INFO; + } +} + +static enum hns_roce_bond_hashtype + get_bond_hashtype(enum netdev_lag_hash netdev_hashtype) +{ + switch (netdev_hashtype) { + case NETDEV_LAG_HASH_L2: + return BOND_HASH_L2; + case NETDEV_LAG_HASH_L34: + return BOND_HASH_L34; + case NETDEV_LAG_HASH_L23: + return BOND_HASH_L23; + default: + WARN(true, "Invalid hash type %d!\n", netdev_hashtype); + return BOND_HASH_L2; + } +} + +int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, + enum hns_roce_bond_cmd_type bond_type) +{ + enum hns_roce_opcode_type opcode = get_bond_opcode(bond_type); + struct hns_roce_bond_info *slave_info; + struct hns_roce_cmq_desc desc = {}; + int ret; + + slave_info = (struct hns_roce_bond_info *)desc.data; + hns_roce_cmq_setup_basic_desc(&desc, opcode, false); + + slave_info->bond_id = cpu_to_le32(bond_grp->bond_id); + if (bond_type == HNS_ROCE_CLEAR_BOND) + goto out; + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + slave_info->bond_mode = cpu_to_le32(BOND_MODE_1); + if (bond_grp->active_slave_num != 1) + ibdev_warn(&bond_grp->main_hr_dev->ib_dev, + "active slave cnt(%u) in Mode 1 is invalid.\n", + bond_grp->active_slave_num); + } else { + slave_info->bond_mode = cpu_to_le32(BOND_MODE_2_4); + slave_info->hash_policy = + cpu_to_le32(get_bond_hashtype(bond_grp->hash_type)); + } + + slave_info->active_slave_cnt = cpu_to_le32(bond_grp->active_slave_num); + slave_info->active_slave_mask = cpu_to_le32(bond_grp->active_slave_map); + slave_info->slave_mask = cpu_to_le32(bond_grp->slave_map); + +out: + ret = hns_roce_cmq_send(bond_grp->main_hr_dev, &desc, 1); + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "cmq bond type(%d) failed, ret = %d.\n", + bond_type, ret); + + return ret; +} + static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, dma_addr_t base_addr, u8 cmd, unsigned long tag) { @@ -2275,6 +2350,9 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev) caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) << HNS_ROCE_CAP_FLAGS_EX_SHIFT; + if (hr_dev->is_vf) + caps->flags &= ~HNS_ROCE_CAP_FLAG_BOND; + caps->num_cqs = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_CQS); caps->gid_table_len[0] = hr_reg_read(resp_c, PF_CAPS_C_MAX_GID); 
caps->max_cqes = 1 << hr_reg_read(resp_c, PF_CAPS_C_CQ_DEPTH); @@ -7067,7 +7145,7 @@ error_failed_kzalloc: } static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, - bool reset) + bool reset, bool bond_cleanup) { struct hns_roce_dev *hr_dev = handle->priv; @@ -7079,7 +7157,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); - hns_roce_exit(hr_dev); + hns_roce_exit(hr_dev, bond_cleanup); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } @@ -7130,12 +7208,51 @@ reset_chk_err: static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, bool reset) { + /* Suspend bond to avoid concurrency */ + hns_roce_bond_suspend(handle); + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) - return; + goto out; handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT; - __hns_roce_hw_v2_uninit_instance(handle, reset); + __hns_roce_hw_v2_uninit_instance(handle, reset, true); + + handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; + +out: + hns_roce_bond_resume(handle); +} + +struct hns_roce_dev + *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, + int func_idx) +{ + struct hnae3_handle *handle; + int ret; + + handle = bond_grp->bond_func_info[func_idx].handle; + if (!handle || !handle->client) + return NULL; + + ret = hns_roce_hw_v2_init_instance(handle); + if (ret) + return NULL; + + return handle->priv; +} + +void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx) +{ + struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle; + + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) + return; + + handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT; + + __hns_roce_hw_v2_uninit_instance(handle, false, false); handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; } @@ -7144,6 +7261,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) { struct hns_roce_dev *hr_dev; + /* Suspend bond to avoid concurrency */ + hns_roce_bond_suspend(handle); + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) { set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state); return 0; @@ -7174,6 +7294,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle) if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state)) { handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED; + hns_roce_bond_resume(handle); return 0; } @@ -7193,6 +7314,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle) dev_info(dev, "reset done, RoCE client reinit finished.\n"); } + hns_roce_bond_resume(handle); return ret; } @@ -7204,7 +7326,7 @@ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle) handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT; dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n"); msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY); - __hns_roce_hw_v2_uninit_instance(handle, false); + __hns_roce_hw_v2_uninit_instance(handle, false, false); return 0; } @@ -7240,6 +7362,14 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, if (linkup || !hr_dev) return; + /* For bond device, the link status depends on the upper netdev, + * and the upper device's link status depends on all the slaves' + * netdev but not only one. So bond device cannot get a correct + * link status from this path. 
+ */ + if (hns_roce_get_bond_grp(netdev, get_hr_bus_num(hr_dev))) + return; + ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev); } @@ -7264,6 +7394,7 @@ static int __init hns_roce_hw_v2_init(void) static void __exit hns_roce_hw_v2_exit(void) { + hns_roce_dealloc_bond_grp(); hnae3_unregister_client(&hns_roce_hw_v2_client); hns_roce_cleanup_debugfs(); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index e64a04d6f85b..285fe0875fac 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -35,6 +35,7 @@ #include <linux/bitops.h> #include "hnae3.h" +#include "hns_roce_bond.h" #define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32 #define HNS_ROCE_V2_MTT_ENTRY_SZ 64 @@ -228,6 +229,9 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, HNS_ROCE_QUERY_RAM_ECC = 0x8513, HNS_SWITCH_PARAMETER_CFG = 0x1033, + HNS_ROCE_OPC_SET_BOND_INFO = 0x8601, + HNS_ROCE_OPC_CLEAR_BOND_INFO = 0x8602, + HNS_ROCE_OPC_CHANGE_ACTIVE_PORT = 0x8603, }; #define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000 @@ -1465,7 +1469,23 @@ struct hns_roce_sccc_clr_done { __le32 rsv[5]; }; +struct hns_roce_bond_info { + __le32 bond_id; + __le32 bond_mode; + __le32 active_slave_cnt; + __le32 active_slave_mask; + __le32 slave_mask; + __le32 hash_policy; +}; + +struct hns_roce_dev + *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, + int func_idx); +void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx); int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, + enum hns_roce_bond_cmd_type bond_type); static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2], void __iomem *dest) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index f3607fe107a7..2f4864ab7d4e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -32,7 +32,6 @@ */ #include <linux/acpi.h> #include <linux/module.h> -#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> @@ -41,6 +40,7 @@ #include "hns_roce_device.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -89,30 +89,75 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) return ret; } -static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port, - unsigned long event) +static int hns_roce_get_port_state(struct hns_roce_dev *hr_dev, u32 port_num, + enum ib_port_state *state) { + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + struct net_device *net_dev; + + net_dev = ib_device_get_netdev(&hr_dev->ib_dev, port_num); + if (!net_dev) + return -ENODEV; + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp) { + *state = ib_get_curr_port_state(bond_grp->upper_dev); + goto out; + } + } + + *state = ib_get_curr_port_state(net_dev); +out: + dev_put(net_dev); + return 0; +} + +static int handle_en_event(struct net_device *netdev, + struct hns_roce_dev *hr_dev, + u32 port, unsigned long event) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; struct device *dev = hr_dev->dev; - struct net_device *netdev; + enum ib_port_state curr_state; + struct ib_event ibevent; int ret = 0; - netdev = 
hr_dev->iboe.netdevs[port]; if (!netdev) { dev_err(dev, "can't find netdev on port(%u)!\n", port); return -ENODEV; } switch (event) { - case NETDEV_UP: - case NETDEV_CHANGE: case NETDEV_REGISTER: case NETDEV_CHANGEADDR: ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); break; + case NETDEV_UP: + case NETDEV_CHANGE: + ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); + if (ret) + return ret; + fallthrough; case NETDEV_DOWN: - /* - * In v1 engine, only support all ports closed together. - */ + if (!netif_is_lag_master(netdev)) + break; + curr_state = ib_get_curr_port_state(netdev); + + write_lock_irq(&ibdev->cache_lock); + if (ibdev->port_data[port].cache.last_port_state == curr_state) { + write_unlock_irq(&ibdev->cache_lock); + return 0; + } + ibdev->port_data[port].cache.last_port_state = curr_state; + write_unlock_irq(&ibdev->cache_lock); + + ibevent.event = (curr_state == IB_PORT_DOWN) ? + IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; + ibevent.device = ibdev; + ibevent.element.port_num = port + 1; + ib_dispatch_event(&ibevent); break; default: dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event)); @@ -126,17 +171,25 @@ static int hns_roce_netdev_event(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct hns_roce_bond_group *bond_grp; struct hns_roce_ib_iboe *iboe = NULL; struct hns_roce_dev *hr_dev = NULL; + struct net_device *upper = NULL; int ret; u32 port; hr_dev = container_of(self, struct hns_roce_dev, iboe.nb); iboe = &hr_dev->iboe; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0), + get_hr_bus_num(hr_dev)); + upper = bond_grp ? bond_grp->upper_dev : NULL; + } for (port = 0; port < hr_dev->caps.num_ports; port++) { - if (dev == iboe->netdevs[port]) { - ret = handle_en_event(hr_dev, port, event); + if ((!upper && dev == iboe->netdevs[port]) || + (upper && dev == upper)) { + ret = handle_en_event(dev, hr_dev, port, event); if (ret) return NOTIFY_DONE; break; @@ -148,12 +201,13 @@ static int hns_roce_netdev_event(struct notifier_block *self, static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev) { + struct net_device *net_dev; int ret; u8 i; for (i = 0; i < hr_dev->caps.num_ports; i++) { - ret = hns_roce_set_mac(hr_dev, i, - hr_dev->iboe.netdevs[i]->dev_addr); + net_dev = get_hr_netdev(hr_dev, i); + ret = hns_roce_set_mac(hr_dev, i, net_dev->dev_addr); if (ret) return ret; } @@ -221,9 +275,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, struct ib_port_attr *props) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); - struct device *dev = hr_dev->dev; struct net_device *net_dev; - unsigned long flags; enum ib_mtu mtu; u32 port; int ret; @@ -244,26 +296,26 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, if (ret) ibdev_warn(ib_dev, "failed to get speed, ret = %d.\n", ret); - spin_lock_irqsave(&hr_dev->iboe.lock, flags); - - net_dev = hr_dev->iboe.netdevs[port]; + net_dev = ib_device_get_netdev(ib_dev, port_num); if (!net_dev) { - spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); - dev_err(dev, "find netdev %u failed!\n", port); + ibdev_err(ib_dev, "find netdev %u failed!\n", port); return -EINVAL; } mtu = iboe_get_mtu(net_dev->mtu); props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256; - props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ? 
- IB_PORT_ACTIVE : - IB_PORT_DOWN; + + dev_put(net_dev); + + ret = hns_roce_get_port_state(hr_dev, port_num, &props->state); + if (ret) { + ibdev_err(ib_dev, "failed to get port state.\n"); + return ret; + } + props->phys_state = props->state == IB_PORT_ACTIVE ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; - - spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); - return 0; } @@ -617,9 +669,40 @@ static int hns_roce_get_hw_stats(struct ib_device *device, return num_counters; } -static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) +static void + hns_roce_unregister_bond_cleanup(struct hns_roce_dev *hr_dev, + struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + int i; + + /* To avoid the loss of other slave devices when main_hr_dev + * is unregistered, re-initialize the remaining slaves before + * the bond resources cleanup. + */ + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev && net_dev != get_hr_netdev(hr_dev, 0)) + hns_roce_bond_init_client(bond_grp, i); + } + + hns_roce_cleanup_bond(bond_grp); +} + +static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev, + bool bond_cleanup) { + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + + if (bond_cleanup && hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp) + hns_roce_unregister_bond_cleanup(hr_dev, bond_grp); + } hr_dev->active = false; unregister_netdevice_notifier(&iboe->nb); @@ -708,11 +791,12 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = { static int hns_roce_register_device(struct hns_roce_dev *hr_dev) { - int ret; struct hns_roce_ib_iboe *iboe = NULL; - struct ib_device *ib_dev = NULL; struct device *dev = hr_dev->dev; + struct ib_device *ib_dev = NULL; + struct net_device *net_dev; unsigned int i; + int ret; iboe = &hr_dev->iboe; spin_lock_init(&iboe->lock); @@ -747,17 +831,38 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops); - for (i = 0; i < hr_dev->caps.num_ports; i++) { - if (!hr_dev->iboe.netdevs[i]) - continue; - ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i], - i + 1); - if (ret) + dma_set_max_seg_size(dev, SZ_2G); + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + ret = hns_roce_alloc_bond_grp(hr_dev); + if (ret) { + dev_err(dev, "failed to alloc bond_grp for bus %u, ret = %d\n", + get_hr_bus_num(hr_dev), ret); return ret; + } + } + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND && + hns_roce_bond_is_active(hr_dev)) { + ret = hns_roce_bond_init(hr_dev); + if (ret) { + dev_err(dev, "failed to init bond!\n"); + return ret; + } + ret = ib_register_device(ib_dev, "hns_bond_%d", dev); + } else { + for (i = 0; i < hr_dev->caps.num_ports; i++) { + net_dev = get_hr_netdev(hr_dev, i); + if (!net_dev) + continue; + + ret = ib_device_set_netdev(ib_dev, net_dev, i + 1); + if (ret) + return ret; + } + ret = ib_register_device(ib_dev, "hns_%d", dev); } - dma_set_max_seg_size(dev, SZ_2G); - ret = ib_register_device(ib_dev, "hns_%d", dev); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; @@ -1157,10 +1262,10 @@ error_failed_alloc_dfx_cnt: return ret; } 
-void hns_roce_exit(struct hns_roce_dev *hr_dev) +void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup) { hns_roce_unregister_debugfs(hr_dev); - hns_roce_unregister_device(hr_dev); + hns_roce_unregister_device(hr_dev, bond_cleanup); if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index d35cf59d0f43..225c3e328e0e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include <linux/pci.h> #include "hns_roce_device.h" void hns_roce_init_pd_table(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index bdd879ac12dd..d1640c5fbaab 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> @@ -1348,11 +1347,13 @@ static int check_mtu_validate(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_attr *attr, int attr_mask) { + struct net_device *net_dev; enum ib_mtu active_mtu; int p; p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port; - active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu); + net_dev = get_hr_netdev(hr_dev, p); + active_mtu = iboe_get_mtu(net_dev->mtu); if ((hr_dev->caps.max_mtu >= IB_MTU_2048 && attr->path_mtu > hr_dev->caps.max_mtu) || diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 1090051f493b..8a6efb6b9c9e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -3,7 +3,6 @@ * Copyright (c) 2018 Hisilicon Limited. 
*/ -#include <linux/pci.h> #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> #include "hns_roce_device.h" diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index c6a0a661d6e7..f4f4f92ba63a 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -3710,7 +3710,7 @@ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) iwpd = iwqp->iwpd; tagged_offset = (uintptr_t)iwqp->ietf_mem.va; ibmr = irdma_reg_phys_mr(&iwpd->ibpd, iwqp->ietf_mem.pa, buf_len, - IB_ACCESS_LOCAL_WRITE, &tagged_offset); + IB_ACCESS_LOCAL_WRITE, &tagged_offset, false); if (IS_ERR(ibmr)) { ret = -ENOMEM; goto error; diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 4ef1c29032f7..ce5cf89c463c 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -2943,8 +2943,6 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, __le64 *wqe; struct irdma_sc_cqp *cqp; u64 hdr; - struct irdma_sc_ceq *ceq; - int ret_code = 0; cqp = cq->dev->cqp; if (cq->cq_uk.cq_id >= cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt) @@ -2953,19 +2951,9 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, if (cq->ceq_id >= cq->dev->hmc_fpm_misc.max_ceqs) return -EINVAL; - ceq = cq->dev->ceq[cq->ceq_id]; - if (ceq && ceq->reg_cq) - ret_code = irdma_sc_add_cq_ctx(ceq, cq); - - if (ret_code) - return ret_code; - wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); - if (!wqe) { - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, cq); + if (!wqe) return -ENOMEM; - } set_64bit_val(wqe, 0, cq->cq_uk.cq_size); set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); @@ -3018,17 +3006,12 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq) struct irdma_sc_cqp *cqp; __le64 *wqe; u64 hdr; - struct irdma_sc_ceq *ceq; cqp = cq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) return -ENOMEM; - ceq = cq->dev->ceq[cq->ceq_id]; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, cq); - set_64bit_val(wqe, 0, cq->cq_uk.cq_size); set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); set_64bit_val(wqe, 40, cq->shadow_area_pa); @@ -3602,71 +3585,6 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, } /** - * irdma_sc_find_reg_cq - find cq ctx index - * @ceq: ceq sc structure - * @cq: cq sc structure - */ -static u32 irdma_sc_find_reg_cq(struct irdma_sc_ceq *ceq, - struct irdma_sc_cq *cq) -{ - u32 i; - - for (i = 0; i < ceq->reg_cq_size; i++) { - if (cq == ceq->reg_cq[i]) - return i; - } - - return IRDMA_INVALID_CQ_IDX; -} - -/** - * irdma_sc_add_cq_ctx - add cq ctx tracking for ceq - * @ceq: ceq sc structure - * @cq: cq sc structure - */ -int irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq) -{ - unsigned long flags; - - spin_lock_irqsave(&ceq->req_cq_lock, flags); - - if (ceq->reg_cq_size == ceq->elem_cnt) { - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - return -ENOMEM; - } - - ceq->reg_cq[ceq->reg_cq_size++] = cq; - - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - - return 0; -} - -/** - * irdma_sc_remove_cq_ctx - remove cq ctx tracking for ceq - * @ceq: ceq sc structure - * @cq: cq sc structure - */ -void irdma_sc_remove_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq) -{ - unsigned long flags; - u32 cq_ctx_idx; - - spin_lock_irqsave(&ceq->req_cq_lock, flags); - cq_ctx_idx = irdma_sc_find_reg_cq(ceq, cq); - if (cq_ctx_idx == IRDMA_INVALID_CQ_IDX) - goto exit; - - 
ceq->reg_cq_size--; - if (cq_ctx_idx != ceq->reg_cq_size) - ceq->reg_cq[cq_ctx_idx] = ceq->reg_cq[ceq->reg_cq_size]; - ceq->reg_cq[ceq->reg_cq_size] = NULL; - -exit: - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); -} - -/** * irdma_sc_cqp_init - Initialize buffers for a control Queue Pair * @cqp: IWARP control queue pair pointer * @info: IWARP control queue pair init info pointer @@ -3950,11 +3868,13 @@ int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp) */ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) { + unsigned long flags; u64 temp_val; u16 sw_cq_sel; u8 arm_next_se; u8 arm_seq_num; + spin_lock_irqsave(&ccq->dev->cqp_lock, flags); get_64bit_val(ccq->cq_uk.shadow_area, 32, &temp_val); sw_cq_sel = (u16)FIELD_GET(IRDMA_CQ_DBSA_SW_CQ_SELECT, temp_val); arm_next_se = (u8)FIELD_GET(IRDMA_CQ_DBSA_ARM_NEXT_SE, temp_val); @@ -3965,6 +3885,7 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT_SE, arm_next_se) | FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT, 1); set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val); + spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags); dma_wmb(); /* make sure shadow area is updated before arming */ @@ -4387,9 +4308,6 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, ceq->ceq_elem_pa = info->ceqe_pa; ceq->virtual_map = info->virtual_map; ceq->itr_no_expire = info->itr_no_expire; - ceq->reg_cq = info->reg_cq; - ceq->reg_cq_size = 0; - spin_lock_init(&ceq->req_cq_lock); ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0); ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0); ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL); @@ -4472,9 +4390,6 @@ int irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq) { struct irdma_sc_cqp *cqp; - if (ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, ceq->dev->ccq); - cqp = ceq->dev->cqp; cqp->process_cqp_sds = irdma_update_sds_noccq; @@ -4493,11 +4408,6 @@ int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch) struct irdma_sc_dev *dev = ceq->dev; dev->ccq->vsi_idx = ceq->vsi_idx; - if (ceq->reg_cq) { - ret_code = irdma_sc_add_cq_ctx(ceq, ceq->dev->ccq); - if (ret_code) - return ret_code; - } ret_code = irdma_sc_ceq_create(ceq, scratch, true); if (!ret_code) @@ -4562,7 +4472,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) struct irdma_sc_cq *temp_cq; u8 polarity; u32 cq_idx; - unsigned long flags; do { cq_idx = 0; @@ -4583,11 +4492,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) } cq = temp_cq; - if (ceq->reg_cq) { - spin_lock_irqsave(&ceq->req_cq_lock, flags); - cq_idx = irdma_sc_find_reg_cq(ceq, cq); - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - } IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) @@ -4731,7 +4635,8 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch, u64 hdr; dev = aeq->dev; - if (dev->privileged) + + if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2) writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]); cqp = dev->cqp; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 7bad0e38786a..d1fc5726b979 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -2365,7 +2365,6 @@ static int irdma_cqp_manage_apbvt_cmd(struct irdma_device *iwdev, cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_apbvt_entry.info; - memset(info, 0, sizeof(*info)); info->add = add_port; info->port = accel_local_port; cqp_info->cqp_cmd = IRDMA_OP_MANAGE_APBVT_ENTRY; @@ -2474,7 
+2473,6 @@ void irdma_manage_arp_cache(struct irdma_pci_f *rf, if (action == IRDMA_ARP_ADD) { cqp_info->cqp_cmd = IRDMA_OP_ADD_ARP_CACHE_ENTRY; info = &cqp_info->in.u.add_arp_cache_entry.info; - memset(info, 0, sizeof(*info)); info->arp_index = (u16)arp_index; info->permanent = true; ether_addr_copy(info->mac_addr, mac_addr); @@ -2533,7 +2531,6 @@ int irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo, cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_qhash_table_entry.info; - memset(info, 0, sizeof(*info)); info->vsi = &iwdev->vsi; info->manage = mtype; info->entry_type = etype; diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c index 27b191f61caf..b49fd9cf2476 100644 --- a/drivers/infiniband/hw/irdma/icrdma_if.c +++ b/drivers/infiniband/hw/irdma/icrdma_if.c @@ -302,7 +302,8 @@ err_rt_init: err_ctrl_init: icrdma_deinit_interrupts(rf, cdev_info); err_init_interrupts: - kfree(iwdev->rf); + mutex_destroy(&rf->ah_tbl_lock); + kfree(rf); ib_dealloc_device(&iwdev->ibdev); return err; @@ -319,6 +320,9 @@ static void icrdma_remove(struct auxiliary_device *aux_dev) ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); irdma_ib_unregister_device(iwdev); icrdma_deinit_interrupts(iwdev->rf, cdev_info); + mutex_destroy(&iwdev->rf->ah_tbl_lock); + + kfree(iwdev->rf); pr_debug("INIT: Gen[%d] func[%d] device remove success\n", rdma_ver, PCI_FUNC(cdev_info->pdev->devfn)); diff --git a/drivers/infiniband/hw/irdma/ig3rdma_if.c b/drivers/infiniband/hw/irdma/ig3rdma_if.c index 1bb42eb298ba..e1d6670d9396 100644 --- a/drivers/infiniband/hw/irdma/ig3rdma_if.c +++ b/drivers/infiniband/hw/irdma/ig3rdma_if.c @@ -55,6 +55,7 @@ static int ig3rdma_vchnl_init(struct irdma_pci_f *rf, ret = irdma_sc_vchnl_init(&rf->sc_dev, &virt_info); if (ret) { destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); return ret; } @@ -124,7 +125,9 @@ static void ig3rdma_decfg_rf(struct irdma_pci_f *rf) { struct irdma_hw *hw = &rf->hw; + mutex_destroy(&rf->ah_tbl_lock); destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); kfree(hw->io_regs); iounmap(hw->rdma_reg.addr); } @@ -149,6 +152,7 @@ static int ig3rdma_cfg_rf(struct irdma_pci_f *rf, err = ig3rdma_cfg_regions(&rf->hw, cdev_info); if (err) { destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); return err; } diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 886b30da188a..baab61e424a2 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -556,7 +556,7 @@ void irdma_copy_ip_htonl(__be32 *dst, u32 *src); u16 irdma_get_vlan_ipv4(u32 *addr); void irdma_get_vlan_mac_ipv6(u32 *addr, u16 *vlan_id, u8 *mac); struct ib_mr *irdma_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, - int acc, u64 *iova_start); + int acc, u64 *iova_start, bool dma_mr); int irdma_upload_qp_context(struct irdma_qp *iwqp, bool freeze, bool raw); void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd, @@ -564,7 +564,6 @@ int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd, void (*callback_fcn)(struct irdma_cqp_request *cqp_request), void *cb_param); void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request); -bool irdma_cq_empty(struct irdma_cq *iwcq); int irdma_inetaddr_event(struct notifier_block *notifier, unsigned long event, void *ptr); int 
irdma_inet6addr_event(struct notifier_block *notifier, unsigned long event, diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index fa6325adaede..28dfad7f940c 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -506,12 +506,14 @@ exit: void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, struct irdma_pble_alloc *palloc) { - pble_rsrc->freedpbles += palloc->total_cnt; - if (palloc->level == PBLE_LEVEL_2) free_lvl2(pble_rsrc, palloc); else irdma_prm_return_pbles(&pble_rsrc->pinfo, &palloc->level1.chunkinfo); + + mutex_lock(&pble_rsrc->pble_mutex_lock); + pble_rsrc->freedpbles += palloc->total_cnt; pble_rsrc->stats_alloc_freed++; + mutex_unlock(&pble_rsrc->pble_mutex_lock); } diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 694e5a9ed15d..cee47ddbd1b5 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -685,7 +685,6 @@ static int irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc) ukqp->rq_size = rsrc->rq_size; IRDMA_RING_INIT(ukqp->sq_ring, ukqp->sq_size); - IRDMA_RING_INIT(ukqp->initial_ring, ukqp->sq_size); IRDMA_RING_INIT(ukqp->rq_ring, ukqp->rq_size); ukqp->wqe_alloc_db = qp->pd->dev->wqe_alloc_db; @@ -726,7 +725,6 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) struct irdma_sc_cqp *cqp; u64 hdr; struct irdma_ccq_cqe_info compl_info; - int status = 0; cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0); @@ -756,16 +754,8 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) print_hex_dump_debug("PUDA: PUDA CREATE CQ", DUMP_PREFIX_OFFSET, 16, 8, wqe, IRDMA_CQP_WQE_SIZE * 8, false); irdma_sc_cqp_post_sq(dev->cqp); - status = irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ, - &compl_info); - if (!status) { - struct irdma_sc_ceq *ceq = dev->ceq[0]; - - if (ceq && ceq->reg_cq) - status = irdma_sc_add_cq_ctx(ceq, cq); - } - - return status; + return irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ, + &compl_info); } /** @@ -897,23 +887,17 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type, struct irdma_puda_buf *buf = NULL; struct irdma_puda_buf *nextbuf = NULL; struct irdma_virt_mem *vmem; - struct irdma_sc_ceq *ceq; - ceq = vsi->dev->ceq[0]; switch (type) { case IRDMA_PUDA_RSRC_TYPE_ILQ: rsrc = vsi->ilq; vmem = &vsi->ilq_mem; vsi->ilq = NULL; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, &rsrc->cq); break; case IRDMA_PUDA_RSRC_TYPE_IEQ: rsrc = vsi->ieq; vmem = &vsi->ieq_mem; vsi->ieq = NULL; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, &rsrc->cq); break; default: ibdev_dbg(to_ibdev(dev), "PUDA: error resource type = 0x%x\n", diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index c1b8f81ea283..cab4896640a1 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -492,9 +492,6 @@ struct irdma_sc_ceq { u32 first_pm_pbl_idx; u8 polarity; u16 vsi_idx; - struct irdma_sc_cq **reg_cq; - u32 reg_cq_size; - spinlock_t req_cq_lock; /* protect access to reg_cq array */ bool virtual_map:1; bool tph_en:1; bool itr_no_expire:1; @@ -894,8 +891,6 @@ struct irdma_ceq_init_info { u8 tph_val; u16 vsi_idx; u32 first_pm_pbl_idx; - struct irdma_sc_cq **reg_cq; - u32 reg_cq_idx; }; struct irdma_aeq_init_info { diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index ce1ae10c30fc..f0846b800913 100644 --- 
a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -114,33 +114,8 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx) */ void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp) { - u64 temp; - u32 hw_sq_tail; - u32 sw_sq_head; - - /* valid bit is written and loads completed before reading shadow */ - mb(); - - /* read the doorbell shadow area */ - get_64bit_val(qp->shadow_area, 0, &temp); - - hw_sq_tail = (u32)FIELD_GET(IRDMA_QP_DBSA_HW_SQ_TAIL, temp); - sw_sq_head = IRDMA_RING_CURRENT_HEAD(qp->sq_ring); - if (sw_sq_head != qp->initial_ring.head) { - if (sw_sq_head != hw_sq_tail) { - if (sw_sq_head > qp->initial_ring.head) { - if (hw_sq_tail >= qp->initial_ring.head && - hw_sq_tail < sw_sq_head) - writel(qp->qp_id, qp->wqe_alloc_db); - } else { - if (hw_sq_tail >= qp->initial_ring.head || - hw_sq_tail < sw_sq_head) - writel(qp->qp_id, qp->wqe_alloc_db); - } - } - } - - qp->initial_ring.head = qp->sq_ring.head; + dma_wmb(); + writel(qp->qp_id, qp->wqe_alloc_db); } /** @@ -194,6 +169,7 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, qp->sq_wrtrk_array[*wqe_idx].wrid = info->wr_id; qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size; qp->sq_wrtrk_array[*wqe_idx].quanta = quanta; + qp->sq_wrtrk_array[*wqe_idx].signaled = info->signaled; return wqe; } @@ -1137,6 +1113,27 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, } /** + * irdma_uk_cq_empty - Check if CQ is empty + * @cq: hw cq + */ +bool irdma_uk_cq_empty(struct irdma_cq_uk *cq) +{ + __le64 *cqe; + u8 polarity; + u64 qword3; + + if (cq->avoid_mem_cflct) + cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(cq); + else + cqe = IRDMA_GET_CURRENT_CQ_ELEM(cq); + + get_64bit_val(cqe, 24, &qword3); + polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); + + return polarity != cq->polarity; +} + +/** * irdma_uk_cq_poll_cmpl - get cq completion info * @cq: hw cq * @info: cq poll information returned @@ -1287,6 +1284,8 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3); if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) { + unsigned long flags; + srq = qp->srq_uk; get_64bit_val(cqe, 8, &info->wr_id); @@ -1299,8 +1298,11 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, } else { info->stag_invalid_set = false; } + spin_lock_irqsave(srq->lock, flags); IRDMA_RING_MOVE_TAIL(srq->srq_ring); + spin_unlock_irqrestore(srq->lock, flags); pring = &srq->srq_ring; + } else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) { u32 array_idx; @@ -1355,6 +1357,10 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid; if (!info->comp_status) info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len; + if (!qp->sq_wrtrk_array[wqe_idx].signaled) { + ret_code = -EFAULT; + goto exit; + } info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3); IRDMA_RING_SET_TAIL(qp->sq_ring, wqe_idx + qp->sq_wrtrk_array[wqe_idx].quanta); @@ -1420,8 +1426,9 @@ exit: IRDMA_RING_MOVE_TAIL(cq->cq_ring); if (!cq->avoid_mem_cflct && ext_valid) IRDMA_RING_MOVE_TAIL(cq->cq_ring); - set_64bit_val(cq->shadow_area, 0, - IRDMA_RING_CURRENT_HEAD(cq->cq_ring)); + if (IRDMA_RING_CURRENT_HEAD(cq->cq_ring) & 0x3F || irdma_uk_cq_empty(cq)) + set_64bit_val(cq->shadow_area, 0, + IRDMA_RING_CURRENT_HEAD(cq->cq_ring)); } else { qword3 &= ~IRDMA_CQ_WQEIDX; qword3 |= FIELD_PREP(IRDMA_CQ_WQEIDX, pring->tail); @@ -1574,7 +1581,6 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp, qp->conn_wqes = move_cnt; 
IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->sq_ring, move_cnt); IRDMA_RING_MOVE_TAIL_BY_COUNT(qp->sq_ring, move_cnt); - IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->initial_ring, move_cnt); } /** @@ -1719,7 +1725,6 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info) qp->max_sq_frag_cnt = info->max_sq_frag_cnt; sq_ring_size = qp->sq_size << info->sq_shift; IRDMA_RING_INIT(qp->sq_ring, sq_ring_size); - IRDMA_RING_INIT(qp->initial_ring, sq_ring_size); if (info->first_sq_wq) { irdma_setup_connection_wqes(qp, info); qp->swqe_polarity = 1; diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index ab57f689827a..9eb7fd0b1cbf 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -429,6 +429,7 @@ struct irdma_wqe_uk_ops { struct irdma_bind_window *op_info); }; +bool irdma_uk_cq_empty(struct irdma_cq_uk *cq); int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info); void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, @@ -456,7 +457,6 @@ struct irdma_srq_uk { struct irdma_uk_attrs *uk_attrs; __le64 *shadow_area; struct irdma_ring srq_ring; - struct irdma_ring initial_ring; u32 srq_id; u32 srq_size; u32 max_srq_frag_cnt; @@ -465,6 +465,7 @@ struct irdma_srq_uk { u8 wqe_size; u8 wqe_size_multiplier; u8 deferred_flag; + spinlock_t *lock; }; struct irdma_srq_uk_init_info { @@ -482,7 +483,8 @@ struct irdma_sq_uk_wr_trk_info { u64 wrid; u32 wr_len; u16 quanta; - u8 reserved[2]; + u8 signaled; + u8 reserved[1]; }; struct irdma_qp_quanta { diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 8b94d87b0192..cc2a12f735d3 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -452,6 +452,7 @@ struct irdma_cqp_request *irdma_alloc_and_get_cqp_request(struct irdma_cqp *cqp, cqp_request->waiting = wait; refcount_set(&cqp_request->refcnt, 1); memset(&cqp_request->compl_info, 0, sizeof(cqp_request->compl_info)); + memset(&cqp_request->info, 0, sizeof(cqp_request->info)); return cqp_request; } @@ -1068,7 +1069,6 @@ int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) cqp_info = &cqp_request->info; qp_info = &cqp_request->info.in.u.qp_create.info; - memset(qp_info, 0, sizeof(*qp_info)); qp_info->cq_num_valid = true; qp_info->next_iwarp_state = IRDMA_QP_STATE_RTS; cqp_info->cqp_cmd = IRDMA_OP_QP_CREATE; @@ -1343,7 +1343,6 @@ int irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = IRDMA_OP_QP_DESTROY; cqp_info->post_sq = 1; cqp_info->in.u.qp_destroy.qp = qp; @@ -1749,7 +1748,6 @@ int irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = IRDMA_OP_STATS_GATHER; cqp_info->post_sq = 1; cqp_info->in.u.stats_gather.info = pestat->gather_info; @@ -1789,7 +1787,6 @@ int irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = cmd; cqp_info->post_sq = 1; cqp_info->in.u.stats_manage.info = *stats_info; @@ -1890,7 +1887,6 @@ int irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = cmd; cqp_info->post_sq = 1; cqp_info->in.u.ws_node.info = 
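The user.h/uk.c hunks add a per-WR "signaled" byte to the SQ tracking array so the poll path can tell how a completed WQE was posted. A sketch of keeping that per-slot metadata alongside the hardware ring; the field layout mirrors the hunk, everything else is simplified:

#include <linux/types.h>

struct demo_wr_trk {
        u64 wrid;
        u32 wr_len;
        u16 quanta;
        u8  signaled;
        u8  reserved[1];
};

/* at post time: remember how the WR was posted, indexed by its WQE slot */
static void demo_record_wr(struct demo_wr_trk *trk, u32 wqe_idx, u64 wr_id,
                           u32 len, u16 quanta, bool signaled)
{
        trk[wqe_idx].wrid = wr_id;
        trk[wqe_idx].wr_len = len;
        trk[wqe_idx].quanta = quanta;
        trk[wqe_idx].signaled = signaled;
}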
*node_info; @@ -2357,24 +2353,6 @@ void irdma_ib_qp_event(struct irdma_qp *iwqp, enum irdma_qp_event_type event) iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context); } -bool irdma_cq_empty(struct irdma_cq *iwcq) -{ - struct irdma_cq_uk *ukcq; - u64 qword3; - __le64 *cqe; - u8 polarity; - - ukcq = &iwcq->sc_cq.cq_uk; - if (ukcq->avoid_mem_cflct) - cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(ukcq); - else - cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq); - get_64bit_val(cqe, 24, &qword3); - polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); - - return polarity != ukcq->polarity; -} - void irdma_remove_cmpls_list(struct irdma_cq *iwcq) { struct irdma_cmpl_gen *cmpl_node; @@ -2436,6 +2414,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) struct irdma_qp_uk *qp = &iwqp->sc_qp.qp_uk; struct irdma_ring *sq_ring = &qp->sq_ring; struct irdma_ring *rq_ring = &qp->rq_ring; + struct irdma_cq *iwscq = iwqp->iwscq; + struct irdma_cq *iwrcq = iwqp->iwrcq; struct irdma_cmpl_gen *cmpl; __le64 *sw_wqe; u64 wqe_qword; @@ -2443,8 +2423,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) bool compl_generated = false; unsigned long flags1; - spin_lock_irqsave(&iwqp->iwscq->lock, flags1); - if (irdma_cq_empty(iwqp->iwscq)) { + spin_lock_irqsave(&iwscq->lock, flags1); + if (irdma_uk_cq_empty(&iwscq->sc_cq.cq_uk)) { unsigned long flags2; spin_lock_irqsave(&iwqp->lock, flags2); @@ -2452,7 +2432,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC); if (!cmpl) { spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); return; } @@ -2471,24 +2451,24 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) kfree(cmpl); continue; } - ibdev_dbg(iwqp->iwscq->ibcq.device, + ibdev_dbg(iwscq->ibcq.device, "DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n", __func__, cmpl->cpi.wr_id, qp->qp_id); - list_add_tail(&cmpl->list, &iwqp->iwscq->cmpl_generated); + list_add_tail(&cmpl->list, &iwscq->cmpl_generated); compl_generated = true; } spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); if (compl_generated) - irdma_comp_handler(iwqp->iwscq); + irdma_comp_handler(iwscq); } else { - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } - spin_lock_irqsave(&iwqp->iwrcq->lock, flags1); - if (irdma_cq_empty(iwqp->iwrcq)) { + spin_lock_irqsave(&iwrcq->lock, flags1); + if (irdma_uk_cq_empty(&iwrcq->sc_cq.cq_uk)) { unsigned long flags2; spin_lock_irqsave(&iwqp->lock, flags2); @@ -2496,7 +2476,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC); if (!cmpl) { spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + spin_unlock_irqrestore(&iwrcq->lock, flags1); return; } @@ -2508,20 +2488,20 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl->cpi.q_type = IRDMA_CQE_QTYPE_RQ; /* remove the RQ WR by moving RQ tail */ IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1); - ibdev_dbg(iwqp->iwrcq->ibcq.device, + ibdev_dbg(iwrcq->ibcq.device, "DEV: %s: adding wr_id = 0x%llx RQ Completion to list qp_id=%d, wqe_idx=%d\n", __func__, cmpl->cpi.wr_id, qp->qp_id, wqe_idx); - list_add_tail(&cmpl->list, 
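The utils.c hunk above drops the kernel-only irdma_cq_empty() in favour of the shared irdma_uk_cq_empty() shown earlier: a CQ that uses a toggling valid bit is empty when the valid bit of the CQE at the ring head does not match the polarity the consumer expects for the current pass through the ring. A self-contained sketch of that check; the bit position used here is illustrative, not the hardware layout:

#include <stdbool.h>
#include <stdint.h>

struct demo_cq {
        const uint64_t *ring;   /* one descriptor word per CQE for the demo */
        uint32_t head;          /* index of the next CQE to consume */
        uint8_t polarity;       /* valid-bit value expected this pass */
};

static bool demo_cq_empty(const struct demo_cq *cq)
{
        uint64_t qword = cq->ring[cq->head];
        uint8_t valid = (qword >> 63) & 1;      /* illustrative valid-bit slot */

        return valid != cq->polarity;
}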
&iwqp->iwrcq->cmpl_generated); + list_add_tail(&cmpl->list, &iwrcq->cmpl_generated); compl_generated = true; } spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + spin_unlock_irqrestore(&iwrcq->lock, flags1); if (compl_generated) - irdma_comp_handler(iwqp->iwrcq); + irdma_comp_handler(iwrcq); } else { - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + spin_unlock_irqrestore(&iwrcq->lock, flags1); mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index c883c9ea5a83..6d9af41a2884 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -27,7 +27,8 @@ static int irdma_query_device(struct ib_device *ibdev, irdma_fw_minor_ver(&rf->sc_dev); props->device_cap_flags = IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_MGT_EXTENSIONS; - props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; + if (hw_attrs->uk_attrs.hw_rev < IRDMA_GEN_3) + props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; props->vendor_id = pcidev->vendor; props->vendor_part_id = pcidev->device; @@ -771,7 +772,6 @@ static int irdma_cqp_create_qp_cmd(struct irdma_qp *iwqp) cqp_info = &cqp_request->info; qp_info = &cqp_request->info.in.u.qp_create.info; - memset(qp_info, 0, sizeof(*qp_info)); qp_info->mac_valid = true; qp_info->cq_num_valid = true; qp_info->next_iwarp_state = IRDMA_QP_STATE_IDLE; @@ -2029,6 +2029,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, struct irdma_pci_f *rf; struct irdma_cq_buf *cq_buf = NULL; unsigned long flags; + u8 cqe_size; int ret; iwdev = to_iwdev(ibcq->device); @@ -2045,7 +2046,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, return -EINVAL; if (!iwcq->user_mode) { - entries++; + entries += 2; if (!iwcq->sc_cq.cq_uk.avoid_mem_cflct && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) @@ -2053,6 +2054,10 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, if (entries & 1) entries += 1; /* cq size must be an even number */ + + cqe_size = iwcq->sc_cq.cq_uk.avoid_mem_cflct ? 64 : 32; + if (entries * cqe_size == IRDMA_HW_PAGE_SIZE) + entries += 2; } info.cq_size = max(entries, 4); @@ -2306,8 +2311,8 @@ static int irdma_setup_kmode_srq(struct irdma_device *iwdev, ukinfo->srq_size = depth >> shift; ukinfo->shadow_area = mem->va + ring_size; - info->shadow_area_pa = info->srq_pa + ring_size; info->srq_pa = mem->pa; + info->shadow_area_pa = info->srq_pa + ring_size; return 0; } @@ -2384,6 +2389,7 @@ static int irdma_create_srq(struct ib_srq *ibsrq, info.vsi = &iwdev->vsi; info.pd = &iwpd->sc_pd; + iwsrq->sc_srq.srq_uk.lock = &iwsrq->lock; err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info); if (err_code) goto free_dmem; @@ -2483,6 +2489,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, int err_code; int entries = attr->cqe; bool cqe_64byte_ena; + u8 cqe_size; err_code = cq_validate_flags(attr->flags, dev->hw_attrs.uk_attrs.hw_rev); if (err_code) @@ -2509,6 +2516,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, ukinfo->cq_id = cq_num; cqe_64byte_ena = dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? true : false; + cqe_size = cqe_64byte_ena ? 
64 : 32; ukinfo->avoid_mem_cflct = cqe_64byte_ena; iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size; if (attr->comp_vector < rf->ceqs_count) @@ -2581,13 +2589,16 @@ static int irdma_create_cq(struct ib_cq *ibcq, goto cq_free_rsrc; } - entries++; + entries += 2; if (!cqe_64byte_ena && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) entries *= 2; if (entries & 1) entries += 1; /* cq size must be an even number */ + if (entries * cqe_size == IRDMA_HW_PAGE_SIZE) + entries += 2; + ukinfo->cq_size = entries; if (cqe_64byte_ena) @@ -3103,12 +3114,10 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, cqp_info = &cqp_request->info; info = &cqp_info->in.u.alloc_stag.info; - memset(info, 0, sizeof(*info)); info->page_size = PAGE_SIZE; info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; info->pd_id = iwpd->sc_pd.pd_id; info->total_len = iwmr->len; - info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; info->remote_access = true; cqp_info->cqp_cmd = IRDMA_OP_ALLOC_STAG; cqp_info->post_sq = 1; @@ -3119,7 +3128,7 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, if (status) return status; - iwmr->is_hwreg = 1; + iwmr->is_hwreg = true; return 0; } @@ -3253,7 +3262,6 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, cqp_info = &cqp_request->info; stag_info = &cqp_info->in.u.mr_reg_non_shared.info; - memset(stag_info, 0, sizeof(*stag_info)); stag_info->va = iwpbl->user_base; stag_info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; stag_info->stag_key = (u8)iwmr->stag; @@ -3263,7 +3271,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) stag_info->remote_atomics_en = (access & IB_ACCESS_REMOTE_ATOMIC) ? 
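The resize_cq/create_cq hunks grow the requested kernel-CQ depth by two, keep the count even, and bump it again when the ring would come out exactly IRDMA_HW_PAGE_SIZE bytes long. A standalone sketch of that arithmetic, assuming a 4 KiB value for IRDMA_HW_PAGE_SIZE (the real constant is not spelled out in this diff):

#include <stdbool.h>
#include <stdint.h>

#define DEMO_HW_PAGE_SIZE 4096u         /* assumed value for illustration */

static uint32_t demo_adjust_cq_entries(uint32_t entries, bool cqe_64byte,
                                       bool gen2_or_later)
{
        uint32_t cqe_size = cqe_64byte ? 64 : 32;

        entries += 2;
        if (!cqe_64byte && gen2_or_later)
                entries *= 2;           /* mirrors the GEN2+ 32-byte CQE case */
        if (entries & 1)
                entries += 1;           /* CQ size must be an even number */
        if (entries * cqe_size == DEMO_HW_PAGE_SIZE)
                entries += 2;           /* avoid an exactly page-sized ring */

        return entries;
}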
1 : 0; stag_info->pd_id = iwpd->sc_pd.pd_id; - stag_info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; + stag_info->all_memory = iwmr->dma_mr; if (stag_info->access_rights & IRDMA_ACCESS_FLAGS_ZERO_BASED) stag_info->addr_type = IRDMA_ADDR_TYPE_ZERO_BASED; else @@ -3290,7 +3298,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request); if (!ret) - iwmr->is_hwreg = 1; + iwmr->is_hwreg = true; return ret; } @@ -3647,7 +3655,6 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr) cqp_info = &cqp_request->info; info = &cqp_info->in.u.dealloc_stag.info; - memset(info, 0, sizeof(*info)); info->pd_id = iwpd->sc_pd.pd_id; info->stag_idx = ib_mr->rkey >> IRDMA_CQPSQ_STAG_IDX_S; info->mr = true; @@ -3663,7 +3670,7 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr) if (status) return status; - iwmr->is_hwreg = 0; + iwmr->is_hwreg = false; return 0; } @@ -3786,9 +3793,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags, * @size: size of memory to register * @access: Access rights * @iova_start: start of virtual address for physical buffers + * @dma_mr: Flag indicating whether this region is a PD DMA MR */ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access, - u64 *iova_start) + u64 *iova_start, bool dma_mr) { struct irdma_device *iwdev = to_iwdev(pd->device); struct irdma_pbl *iwpbl; @@ -3805,6 +3813,7 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access iwpbl = &iwmr->iwpbl; iwpbl->iwmr = iwmr; iwmr->type = IRDMA_MEMREG_TYPE_MEM; + iwmr->dma_mr = dma_mr; iwpbl->user_base = *iova_start; stag = irdma_create_stag(iwdev); if (!stag) { @@ -3843,7 +3852,7 @@ static struct ib_mr *irdma_get_dma_mr(struct ib_pd *pd, int acc) { u64 kva = 0; - return irdma_reg_phys_mr(pd, 0, 0, acc, &kva); + return irdma_reg_phys_mr(pd, 0, 0, acc, &kva, true); } /** @@ -4078,7 +4087,7 @@ static int irdma_post_send(struct ib_qp *ibqp, break; case IB_WR_LOCAL_INV: info.op_type = IRDMA_OP_TYPE_INV_STAG; - info.local_fence = info.read_fence; + info.local_fence = true; info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; err = irdma_uk_stag_local_invalidate(ukqp, &info, true); break; @@ -4505,7 +4514,7 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq, } if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && - (!irdma_cq_empty(iwcq) || !list_empty(&iwcq->cmpl_generated))) + (!irdma_uk_cq_empty(ukcq) || !list_empty(&iwcq->cmpl_generated))) ret = 1; spin_unlock_irqrestore(&iwcq->lock, flags); @@ -5204,7 +5213,7 @@ static int irdma_create_user_ah(struct ib_ah *ibah, struct irdma_ah *parent_ah; int err; - if (udata && udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN) + if (udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN) return -EINVAL; err = irdma_setup_ah(ibah, attr); @@ -5500,7 +5509,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev) irdma_rt_deinit_hw(iwdev); if (!iwdev->is_vport) { irdma_ctrl_deinit_hw(iwdev->rf); - if (iwdev->rf->vchnl_wq) + if (iwdev->rf->vchnl_wq) { destroy_workqueue(iwdev->rf->vchnl_wq); + mutex_destroy(&iwdev->rf->sc_dev.vchnl_mutex); + } } } diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index ac8b38701835..aabbb3442098 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -111,7 +111,8 @@ struct irdma_mr { }; struct ib_umem *region; int access; - u8 is_hwreg; + bool is_hwreg:1; + bool dma_mr:1; u16 type; u32 page_cnt; u64 page_size; diff --git 
a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index 12b481d138cf..03aacd526860 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -591,7 +591,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) int mlx4_ib_cm_init(void) { - cm_wq = alloc_workqueue("mlx4_ib_cm", 0, 0); + cm_wq = alloc_workqueue("mlx4_ib_cm", WQ_PERCPU, 0); if (!cm_wq) return -ENOMEM; diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 8b506417ad2f..d31d7f3005c6 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1225,6 +1225,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(create_flow_table_in, in, other_vport)); MLX5_SET(destroy_flow_table_in, din, vport_number, MLX5_GET(create_flow_table_in, in, vport_number)); + MLX5_SET(destroy_flow_table_in, din, other_eswitch, + MLX5_GET(create_flow_table_in, in, other_eswitch)); + MLX5_SET(destroy_flow_table_in, din, eswitch_owner_vhca_id, + MLX5_GET(create_flow_table_in, in, + eswitch_owner_vhca_id)); MLX5_SET(destroy_flow_table_in, din, table_type, MLX5_GET(create_flow_table_in, in, table_type)); MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id); @@ -1237,6 +1242,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(create_flow_group_in, in, other_vport)); MLX5_SET(destroy_flow_group_in, din, vport_number, MLX5_GET(create_flow_group_in, in, vport_number)); + MLX5_SET(destroy_flow_group_in, din, other_eswitch, + MLX5_GET(create_flow_group_in, in, other_eswitch)); + MLX5_SET(destroy_flow_group_in, din, eswitch_owner_vhca_id, + MLX5_GET(create_flow_group_in, in, + eswitch_owner_vhca_id)); MLX5_SET(destroy_flow_group_in, din, table_type, MLX5_GET(create_flow_group_in, in, table_type)); MLX5_SET(destroy_flow_group_in, din, table_id, @@ -1251,6 +1261,10 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(set_fte_in, in, other_vport)); MLX5_SET(delete_fte_in, din, vport_number, MLX5_GET(set_fte_in, in, vport_number)); + MLX5_SET(delete_fte_in, din, other_eswitch, + MLX5_GET(set_fte_in, in, other_eswitch)); + MLX5_SET(delete_fte_in, din, eswitch_owner_vhca_id, + MLX5_GET(set_fte_in, in, eswitch_owner_vhca_id)); MLX5_SET(delete_fte_in, din, table_type, MLX5_GET(set_fte_in, in, table_type)); MLX5_SET(delete_fte_in, din, table_id, diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index b0f7663c24c1..d17823ce7f38 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -691,22 +691,13 @@ static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device) return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed); } -static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, - struct mlx5_flow_namespace *ns, +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, struct mlx5_ib_flow_prio *prio, - int priority, - int num_entries, int num_groups, - u32 flags, u16 vport) + struct mlx5_flow_table_attr *ft_attr) { - struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; - ft_attr.prio = priority; - ft_attr.max_fte = num_entries; - ft_attr.flags = flags; - ft_attr.vport = vport; - ft_attr.autogroup.max_num_groups = num_groups; - ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + ft = mlx5_create_auto_grouped_flow_table(ns, ft_attr); if (IS_ERR(ft)) return ERR_CAST(ft); @@ -720,6 +711,7 @@ static struct 
mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, enum flow_table_type ft_type) { bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; enum mlx5_flow_namespace_type fn_type; struct mlx5_ib_flow_prio *prio; @@ -797,11 +789,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, max_table_size = min_t(int, num_entries, max_table_size); ft = prio->flow_table; - if (!ft) - return _get_prio(dev, ns, prio, priority, max_table_size, - num_groups, flags, 0); + if (ft) + return prio; - return prio; + ft_attr.prio = priority; + ft_attr.max_fte = max_table_size; + ft_attr.flags = flags; + ft_attr.autogroup.max_num_groups = num_groups; + return _get_prio(ns, prio, &ft_attr); } enum { @@ -950,6 +945,7 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, enum mlx5_ib_optional_counter_type type) { enum mlx5_ib_optional_counter_type per_qp_type; + struct mlx5_flow_table_attr ft_attr = {}; enum mlx5_flow_namespace_type fn_type; struct mlx5_flow_namespace *ns; struct mlx5_ib_flow_prio *prio; @@ -1003,7 +999,10 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, if (prio->flow_table) return 0; - prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0); + ft_attr.prio = priority; + ft_attr.max_fte = MLX5_FS_MAX_POOL_SIZE; + ft_attr.autogroup.max_num_groups = 1; + prio = _get_prio(ns, prio, &ft_attr); if (IS_ERR(prio)) return PTR_ERR(prio); @@ -1223,6 +1222,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) { + struct mlx5_flow_table_attr ft_attr = {}; enum mlx5_flow_namespace_type fn_type; int priority, i, err, spec_num; struct mlx5_flow_act flow_act = {}; @@ -1304,8 +1304,10 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, if (err) goto free; - prio = _get_prio(dev, ns, prio, priority, - dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); + ft_attr.prio = priority; + ft_attr.max_fte = dev->num_ports * MAX_OPFC_RULES; + ft_attr.autogroup.max_num_groups = 1; + prio = _get_prio(ns, prio, &ft_attr); if (IS_ERR(prio)) { err = PTR_ERR(prio); goto put_prio; @@ -1872,7 +1874,7 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, u32 *flags, u16 *vport_idx, u16 *vport, struct mlx5_core_dev **ft_mdev, - u32 ib_port) + u32 ib_port, u16 *esw_owner_vhca_id) { struct mlx5_core_dev *esw_mdev; @@ -1886,8 +1888,13 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, return -EINVAL; esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw); - if (esw_mdev != dev->mdev) - return -EOPNOTSUPP; + if (esw_mdev != dev->mdev) { + if (!MLX5_CAP_ADV_RDMA(dev->mdev, + rdma_transport_manager_other_eswitch)) + return -EOPNOTSUPP; + *flags |= MLX5_FLOW_TABLE_OTHER_ESWITCH; + *esw_owner_vhca_id = MLX5_CAP_GEN(esw_mdev, vhca_id); + } *flags |= MLX5_FLOW_TABLE_OTHER_VPORT; *ft_mdev = esw_mdev; @@ -1903,8 +1910,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, bool mcast, u32 ib_port) { struct mlx5_core_dev *ft_mdev = dev->mdev; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; + u16 esw_owner_vhca_id = 0; int max_table_size = 0; u16 vport_idx = 0; bool esw_encap; @@ -1966,7 +1975,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, return ERR_PTR(-EINVAL); ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags, &vport_idx, &vport, - &ft_mdev, ib_port); + 
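The fs.c hunks change _get_prio() from a long scalar argument list to a caller-filled mlx5_flow_table_attr, so new attributes (vport, flags, and later the eswitch owner id) can be threaded through without touching every caller. A generic sketch of that refactor style; the struct and helpers below are illustrative only, not the mlx5 API:

struct demo_ft_attr {
        int prio;
        int max_fte;
        unsigned int flags;
        unsigned short vport;
        int max_num_groups;
        /* new attributes can be appended without changing call signatures */
};

static int demo_get_prio(const struct demo_ft_attr *attr)
{
        /* stand-in for creating the auto-grouped table from attr */
        return (attr->prio >= 0 && attr->max_fte > 0) ? 0 : -1;
}

static int demo_caller(int priority, int max_fte)
{
        struct demo_ft_attr attr = {
                .prio = priority,
                .max_fte = max_fte,
                .max_num_groups = 1,
        };

        return demo_get_prio(&attr);
}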
&ft_mdev, ib_port, + &esw_owner_vhca_id); if (ret) return ERR_PTR(ret); @@ -2026,8 +2036,13 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, if (prio->flow_table) return prio; - return _get_prio(dev, ns, prio, priority, max_table_size, - MLX5_FS_MAX_TYPES, flags, vport); + ft_attr.prio = priority; + ft_attr.max_fte = max_table_size; + ft_attr.flags = flags; + ft_attr.vport = vport; + ft_attr.esw_owner_vhca_id = esw_owner_vhca_id; + ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES; + return _get_prio(ns, prio, &ft_attr); } static struct mlx5_ib_flow_handler * diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index cc8859d3c2f5..bbecca405171 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -44,6 +44,63 @@ static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports) } } +static int mlx5_ib_set_owner_transport(struct mlx5_core_dev *cur_owner, + struct mlx5_core_dev *new_owner) +{ + int ret; + + if (!MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(cur_owner, ft_support) || + !MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(cur_owner, ft_support)) + return 0; + + if (!MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager) || + !MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager_other_eswitch)) + return 0; + + ret = mlx5_fs_set_root_dev(cur_owner, new_owner, + FS_FT_RDMA_TRANSPORT_TX); + if (ret) + return ret; + + ret = mlx5_fs_set_root_dev(cur_owner, new_owner, + FS_FT_RDMA_TRANSPORT_RX); + if (ret) { + mlx5_fs_set_root_dev(cur_owner, cur_owner, + FS_FT_RDMA_TRANSPORT_TX); + return ret; + } + + return 0; +} + +static void mlx5_ib_release_transport(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev; + int i, ret; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + ret = mlx5_ib_set_owner_transport(peer_dev, peer_dev); + WARN_ON_ONCE(ret); + } +} + +static int mlx5_ib_take_transport(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev; + int ret; + int i; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + ret = mlx5_ib_set_owner_transport(peer_dev, dev); + if (ret) { + mlx5_ib_release_transport(dev); + return ret; + } + } + + return 0; +} + static int mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { @@ -88,10 +145,18 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) else return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); + if (mlx5_lag_is_shared_fdb(dev)) { + ret = mlx5_ib_take_transport(lag_master); + if (ret) + return ret; + } + ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, mlx5_core_net(lag_master)); - if (!ibdev) - return -ENOMEM; + if (!ibdev) { + ret = -ENOMEM; + goto release_transport; + } ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port), GFP_KERNEL); @@ -127,6 +192,10 @@ fail_add: kfree(ibdev->port); fail_port: ib_dealloc_device(&ibdev->ib_dev); +release_transport: + if (mlx5_lag_is_shared_fdb(lag_master)) + mlx5_ib_release_transport(lag_master); + return ret; } @@ -182,6 +251,7 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) esw = peer_mdev->priv.eswitch; mlx5_eswitch_unregister_vport_reps(esw, REP_IB); } + mlx5_ib_release_transport(mdev); } __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 90daa58126f4..40284bbb45d6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -511,6 +511,10 @@ static int translate_eth_ext_proto_oper(u32 
eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_XDR; break; + case MLX5E_PROT_MASK(MLX5E_1600TAUI_8_1600TBASE_CR8_KR8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_XDR; + break; default: return -EINVAL; } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 0e8ae85af5a6..e71ee3d52eb0 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -97,33 +97,28 @@ struct mlx5_pagefault { * a pagefault. */ #define MMU_NOTIFIER_TIMEOUT 1000 -#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) -#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) -#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) -#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) -#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) - -#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT - static u64 mlx5_imr_ksm_entries; +static u64 mlx5_imr_mtt_entries; +static u64 mlx5_imr_mtt_size; +static u8 mlx5_imr_mtt_shift; +static u8 mlx5_imr_ksm_page_shift; -static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, +static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; - struct mlx5_klm *end = pklm + nentries; - int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + struct mlx5_ksm *end = pksm + nentries; + u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0; __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? cpu_to_be32(imr->null_mmkey.key) : mr_to_mdev(imr)->mkeys.null_mkey; u64 va = - MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; + MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (; pklm != end; pklm++, idx++, va += step) { - pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = key; - pklm->va = cpu_to_be64(va); + for (; pksm != end; pksm++, idx++, va += step) { + pksm->key = key; + pksm->va = cpu_to_be64(va); } return; } @@ -147,16 +142,15 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, */ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); - for (; pklm != end; pklm++, idx++, va += step) { + for (; pksm != end; pksm++, idx++, va += step) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); - pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); if (mtt) { - pklm->key = cpu_to_be32(mtt->ibmr.lkey); - pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); + pksm->key = cpu_to_be32(mtt->ibmr.lkey); + pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size); } else { - pklm->key = key; - pklm->va = cpu_to_be64(va); + pksm->key = key; + pksm->va = cpu_to_be64(va); } } } @@ -201,7 +195,7 @@ int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags) { if (flags & MLX5_IB_UPD_XLT_INDIRECT) { - populate_klm(xlt, idx, nentries, mr, flags); + populate_ksm(xlt, idx, nentries, mr, flags); return 0; } else { return populate_mtt(xlt, idx, nentries, mr, flags); @@ -226,7 +220,7 @@ static void free_implicit_child_mr_work(struct work_struct *work) mutex_lock(&odp_imr->umem_mutex); mlx5r_umr_update_xlt(mr->parent, - ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0, + ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); mutex_unlock(&odp_imr->umem_mutex); mlx5_ib_dereg_mr(&mr->ibmr, NULL); @@ -237,7 +231,7 @@ static void free_implicit_child_mr_work(struct work_struct *work) 
static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; + unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift; struct mlx5_ib_mr *imr = mr->parent; /* @@ -265,7 +259,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) /* Freeing a MR is a sleeping operation, so bounce to a work queue */ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); - queue_work(system_unbound_wq, &mr->odp_destroy.work); + queue_work(system_dfl_wq, &mr->odp_destroy.work); } static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, @@ -425,7 +419,10 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && MLX5_CAP_GEN(dev->mdev, null_mkey) && MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && - !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled)) + !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) && + mlx5_imr_ksm_entries != 0 && + !(mlx5_imr_ksm_page_shift > + get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM))) caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; } @@ -476,14 +473,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, int err; odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), - idx * MLX5_IMR_MTT_SIZE, - MLX5_IMR_MTT_SIZE, &mlx5_mn_ops); + idx * mlx5_imr_mtt_size, + mlx5_imr_mtt_size, &mlx5_mn_ops); if (IS_ERR(odp)) return ERR_CAST(odp); mr = mlx5_mr_cache_alloc(dev, imr->access_flags, MLX5_MKC_ACCESS_MODE_MTT, - MLX5_IMR_MTT_ENTRIES); + mlx5_imr_mtt_entries); if (IS_ERR(mr)) { ib_umem_odp_release(odp); return mr; @@ -495,7 +492,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, mr->umem = &odp->umem; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; - mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE; + mr->ibmr.iova = idx * mlx5_imr_mtt_size; mr->parent = imr; odp->private = mr; @@ -506,7 +503,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, refcount_set(&mr->mmkey.usecount, 2); err = mlx5r_umr_update_xlt(mr, 0, - MLX5_IMR_MTT_ENTRIES, + mlx5_imr_mtt_entries, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ENABLE); @@ -611,7 +608,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct mlx5_ib_mr *imr; int err; - if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE)) + if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE)) return ERR_PTR(-EOPNOTSUPP); umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); @@ -647,7 +644,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, err = mlx5r_umr_update_xlt(imr, 0, mlx5_imr_ksm_entries, - MLX5_KSM_PAGE_SHIFT, + mlx5_imr_ksm_page_shift, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ENABLE); @@ -750,20 +747,20 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, struct ib_umem_odp *odp_imr, u64 user_va, size_t bcnt, u32 *bytes_mapped, u32 flags) { - unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; + unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift; unsigned long upd_start_idx = end_idx + 1; unsigned long upd_len = 0; unsigned long npages = 0; int err; int ret; - if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || - mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) + if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size || + 
mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt)) return -EFAULT; /* Fault each child mr that intersects with our interval. */ while (bcnt) { - unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; + unsigned long idx = user_va >> mlx5_imr_mtt_shift; struct ib_umem_odp *umem_odp; struct mlx5_ib_mr *mtt; u64 len; @@ -1924,9 +1921,25 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) int mlx5_ib_odp_init(void) { + u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT; + u8 mlx5_imr_mtt_bits; + + /* 48 is default ARM64 VA space and covers X86 4-level paging which is 47 */ + if (log_va_pages <= 48 - PAGE_SHIFT) + mlx5_imr_mtt_shift = 30; + /* 56 is x86-64, 5-level paging */ + else if (log_va_pages <= 56 - PAGE_SHIFT) + mlx5_imr_mtt_shift = 34; + else + return 0; + + mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift); + mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT; + mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits); mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - - MLX5_IMR_MTT_BITS); + mlx5_imr_mtt_bits); + mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift; return 0; } @@ -2093,6 +2106,6 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, destroy_prefetch_work(work); return rc; } - queue_work(system_unbound_wq, &work->work); + queue_work(system_dfl_wq, &work->work); return 0; } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 88724d15705d..69af20790481 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3451,10 +3451,11 @@ int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate) { u32 stat_rate_support; - if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS) + if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS || + rate == IB_RATE_1600_GBPS) return 0; - if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS) + if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_1600_GBPS) return -EINVAL; stat_rate_support = MLX5_CAP_GEN(dev->mdev, stat_rate_support); diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 0ca2743f1075..e7835ca70e2b 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -518,7 +518,8 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) */ int rvt_driver_cq_init(void) { - comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE, + comp_vector_wq = alloc_workqueue("%s", + WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_PERCPU, 0, "rdmavt_cq"); if (!comp_vector_wq) return -ENOMEM; diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index bcb97b3ea58a..b1df05238848 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -452,7 +452,6 @@ static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int leng length -= bytes; iova += bytes; - page_offset = 0; } return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index ac0183a2ff7a..0195d361e5e3 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -20,6 +20,54 @@ static struct rxe_recv_sockets recv_sockets; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * lockdep can detect false positive circular dependencies + * when there are user-space socket API users or in kernel + * users switching between a tcp and rdma transport. + * Maybe also switching between siw and rxe may cause + * problems as per default sockets are only classified + * by family and not by ip protocol. 
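The mlx5_ib_odp_init() hunk above stops hard-coding a 1 GiB implicit-MR child size and instead derives the chunk shift from the user VA width: shift 30 (1 GiB children) for VA sizes up to 48 bits, shift 34 (16 GiB children) up to 56 bits, otherwise implicit ODP is left disabled. A standalone sketch of that selection, assuming 4 KiB pages and taking log2 of the user VA size as input:

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12      /* assumed 4 KiB pages */

static int demo_imr_mtt_shift(unsigned int log_va_size)
{
        unsigned int log_va_pages = log_va_size - DEMO_PAGE_SHIFT;

        if (log_va_pages <= 48 - DEMO_PAGE_SHIFT)
                return 30;      /* 1 GiB implicit-MR children */
        if (log_va_pages <= 56 - DEMO_PAGE_SHIFT)
                return 34;      /* 16 GiB children, 5-level paging */
        return -1;              /* implicit ODP not supported */
}

int main(void)
{
        printf("48-bit VA -> shift %d\n", demo_imr_mtt_shift(48)); /* 30 */
        printf("56-bit VA -> shift %d\n", demo_imr_mtt_shift(56)); /* 34 */
        return 0;
}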
And there might + * be different locks used between the application + * and the low level sockets. + * + * Problems were seen with ksmbd.ko and cifs.ko, + * switching transports, use git blame to find + * more details. + */ +static struct lock_class_key rxe_recv_sk_key[2]; +static struct lock_class_key rxe_recv_slock_key[2]; +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + +static inline void rxe_reclassify_recv_socket(struct socket *sock) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, + "slock-AF_INET-RDMA-RXE-RECV", + &rxe_recv_slock_key[0], + "sk_lock-AF_INET-RDMA-RXE-RECV", + &rxe_recv_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, + "slock-AF_INET6-RDMA-RXE-RECV", + &rxe_recv_slock_key[1], + "sk_lock-AF_INET6-RDMA-RXE-RECV", + &rxe_recv_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ +} + static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, struct net_device *ndev, struct in_addr *saddr, @@ -192,6 +240,7 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, err = udp_sock_create(net, &udp_cfg, &sock); if (err < 0) return ERR_PTR(err); + rxe_reclassify_recv_socket(sock); tnl_cfg.encap_type = 1; tnl_cfg.encap_rcv = rxe_udp_encap_recv; diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index f58e3ec6252f..ae71812bea82 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -358,7 +358,6 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, length -= bytes; iova += bytes; - page_offset = 0; } mutex_unlock(&umem_odp->umem_mutex); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 95f1c1c2949d..845bdd03ca28 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -15,6 +15,54 @@ #include "rxe_queue.h" #include "rxe_task.h" +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * lockdep can detect false positive circular dependencies + * when there are user-space socket API users or in kernel + * users switching between a tcp and rdma transport. + * Maybe also switching between siw and rxe may cause + * problems as per default sockets are only classified + * by family and not by ip protocol. And there might + * be different locks used between the application + * and the low level sockets. + * + * Problems were seen with ksmbd.ko and cifs.ko, + * switching transports, use git blame to find + * more details. 
+ */ +static struct lock_class_key rxe_send_sk_key[2]; +static struct lock_class_key rxe_send_slock_key[2]; +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + +static inline void rxe_reclassify_send_socket(struct socket *sock) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, + "slock-AF_INET-RDMA-RXE-SEND", + &rxe_send_slock_key[0], + "sk_lock-AF_INET-RDMA-RXE-SEND", + &rxe_send_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, + "slock-AF_INET6-RDMA-RXE-SEND", + &rxe_send_slock_key[1], + "sk_lock-AF_INET6-RDMA-RXE-SEND", + &rxe_send_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ +} + static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, int has_srq) { @@ -244,6 +292,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); if (err < 0) return err; + rxe_reclassify_send_socket(qp->sk); qp->sk->sk->sk_user_data = qp; /* pick a source UDP port number for this QP based on diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 3661cb627d28..2a234f26ac10 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -171,7 +171,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, udata, mi, &srq->rq.producer_lock, &srq->rq.consumer_lock); if (err) - goto err_free; + return err; srq->rq.max_wr = attr->max_wr; } @@ -180,11 +180,6 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, srq->limit = attr->srq_limit; return 0; - -err_free: - rxe_queue_cleanup(q); - srq->rq.queue = NULL; - return err; } void rxe_srq_cleanup(struct rxe_pool_elem *elem) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index eb0bd4f79a85..1d3de8209bfa 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -39,6 +39,55 @@ static void siw_cm_llp_error_report(struct sock *s); static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * lockdep can detect false positive circular dependencies + * when there are user-space socket API users or in kernel + * users switching between a tcp and rdma transport. + * Maybe also switching between siw and rxe may cause + * problems as per default sockets are only classified + * by family and not by ip protocol. And there might + * be different locks used between the application + * and the low level sockets. + * + * Problems were seen with ksmbd.ko and cifs.ko, + * switching transports, use git blame to find + * more details. 
+ */ +static struct lock_class_key siw_sk_key[2]; +static struct lock_class_key siw_slock_key[2]; +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + +static inline void siw_reclassify_socket(struct socket *sock) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, + "slock-AF_INET-RDMA-SIW", + &siw_slock_key[0], + "sk_lock-AF_INET-RDMA-SIW", + &siw_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, + "slock-AF_INET6-RDMA-SIW", + &siw_slock_key[1], + "sk_lock-AF_INET6-RDMA-SIW", + &siw_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ +} + static void siw_sk_assign_cm_upcalls(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); @@ -1394,6 +1443,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) goto error; + siw_reclassify_socket(s); /* * NOTE: For simplification, connect() is called in blocking @@ -1770,6 +1820,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) return rv; + siw_reclassify_socket(s); /* * Allow binding local port when still in TIME_WAIT from last close. diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 2e3c0516ce8f..dc531fad73de 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -1029,7 +1029,7 @@ static int __init iser_init(void) mutex_init(&ig.connlist_mutex); INIT_LIST_HEAD(&ig.connlist); - release_wq = alloc_workqueue("release workqueue", 0, 0); + release_wq = alloc_workqueue("release workqueue", WQ_PERCPU, 0); if (!release_wq) { iser_err("failed to allocate release workqueue\n"); err = -ENOMEM; diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 42977a5326ee..af811d060cc8 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2613,7 +2613,7 @@ static struct iscsit_transport iser_target_transport = { static int __init isert_init(void) { - isert_login_wq = alloc_workqueue("isert_login_wq", 0, 0); + isert_login_wq = alloc_workqueue("isert_login_wq", WQ_PERCPU, 0); if (!isert_login_wq) { isert_err("Unable to allocate isert_login_wq\n"); return -ENOMEM; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index ef4abdea3c2d..9ecc6343455d 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1450,7 +1450,7 @@ err_free_chunks: kfree(srv->chunks); err_free_srv: - kfree(srv); + put_device(&srv->dev); return ERR_PTR(-ENOMEM); } diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 70d29b14d851..99095645134f 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -40,12 +40,13 @@ config IOMMU_IO_PGTABLE_LPAE sizes at both stage-1 and stage-2, as well as address spaces up to 48-bits in size. -config IOMMU_IO_PGTABLE_LPAE_SELFTEST - bool "LPAE selftests" - depends on IOMMU_IO_PGTABLE_LPAE +config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST + tristate "KUnit tests for LPAE" + depends on IOMMU_IO_PGTABLE_LPAE && KUNIT + default KUNIT_ALL_TESTS help - Enable self-tests for LPAE page table allocator. This performs - a series of page-table consistency checks during boot. 
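Several hunks above (mlx4 cm, rdmavt, iser, isert) pass WQ_PERCPU instead of a bare 0 to alloc_workqueue(), making the per-CPU versus unbound choice explicit at the call site rather than relying on the default. A minimal sketch of the call pattern; the workqueue name and wrapper are placeholders:

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *demo_wq;

static int demo_init_wq(void)
{
        /* explicitly per-CPU; WQ_UNBOUND would be the other choice */
        demo_wq = alloc_workqueue("demo_wq", WQ_PERCPU, 0);
        if (!demo_wq)
                return -ENOMEM;
        return 0;
}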
+ Enable kunit tests for LPAE page table allocator. This performs + a series of page-table consistency checks. If unsure, say N here. @@ -247,7 +248,7 @@ config SUN50I_IOMMU config TEGRA_IOMMU_SMMU bool "NVIDIA Tegra SMMU Support" - depends on ARCH_TEGRA + depends on ARCH_TEGRA || COMPILE_TEST depends on TEGRA_AHB depends on TEGRA_MC select IOMMU_API @@ -384,3 +385,5 @@ config SPRD_IOMMU Say Y here if you want to use the multimedia devices listed above. endif # IOMMU_SUPPORT + +source "drivers/iommu/generic_pt/Kconfig" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 355294fa9033..8e8843316c4b 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -3,6 +3,7 @@ obj-y += arm/ iommufd/ obj-$(CONFIG_AMD_IOMMU) += amd/ obj-$(CONFIG_INTEL_IOMMU) += intel/ obj-$(CONFIG_RISCV_IOMMU) += riscv/ +obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o @@ -12,6 +13,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o +obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o obj-$(CONFIG_IOMMU_IOVA) += iova.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index ecef69c11144..f2acf471cb5d 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -11,10 +11,13 @@ config AMD_IOMMU select MMU_NOTIFIER select IOMMU_API select IOMMU_IOVA - select IOMMU_IO_PGTABLE select IOMMU_SVA select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_AMDV1 + select IOMMU_PT_X86_64 depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 59c04a67f398..5412a563c697 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o +obj-y += iommu.o init.o quirks.o ppr.o pasid.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 9b4b589a54b5..25044d28f28a 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag); * the IOMMU used by this driver. 
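The iommu Kconfig/Makefile hunks convert the boot-time LPAE selftests into a KUnit test module (tristate, default KUNIT_ALL_TESTS). A minimal KUnit skeleton of the shape such a conversion typically takes; the suite and case names below are placeholders, not the contents of io-pgtable-arm-selftests.c:

#include <kunit/test.h>
#include <linux/module.h>

static void demo_lpae_consistency_test(struct kunit *test)
{
        /* placeholder assertion; the real tests exercise map/unmap paths */
        KUNIT_EXPECT_EQ(test, 1 + 1, 2);
}

static struct kunit_case demo_lpae_test_cases[] = {
        KUNIT_CASE(demo_lpae_consistency_test),
        {}
};

static struct kunit_suite demo_lpae_test_suite = {
        .name = "io-pgtable-lpae-demo",
        .test_cases = demo_lpae_test_cases,
};
kunit_test_suite(demo_lpae_test_suite);

MODULE_LICENSE("GPL");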
*/ void amd_iommu_flush_all_caches(struct amd_iommu *iommu); -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size); void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index a698a2e7ce2a..78b1c44bd6b5 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -18,7 +18,7 @@ #include <linux/spinlock.h> #include <linux/pci.h> #include <linux/irqreturn.h> -#include <linux/io-pgtable.h> +#include <linux/generic_pt/iommu.h> /* * Maximum number of IOMMUs supported @@ -247,6 +247,10 @@ #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) +#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */ +#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x)) +#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */ +#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x)) /* constants for event buffer handling */ #define EVT_BUFFER_SIZE 8192 /* 512 entries */ @@ -337,76 +341,7 @@ #define GUEST_PGTABLE_4_LEVEL 0x00 #define GUEST_PGTABLE_5_LEVEL 0x01 -#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) -#define PM_LEVEL_SIZE(x) (((x) < 6) ? \ - ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ - (0xffffffffffffffffULL)) -#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) -#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) -#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ - IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW) -#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) - -#define PM_MAP_4k 0 #define PM_ADDR_MASK 0x000ffffffffff000ULL -#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ - (~((1ULL << (12 + ((lvl) * 9))) - 1))) -#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) - -/* - * Returns the page table level to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_LEVEL(pagesize) \ - ((__ffs(pagesize) - 12) / 9) -/* - * Returns the number of ptes to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_PTE_COUNT(pagesize) \ - (1ULL << ((__ffs(pagesize) - 12) % 9)) - -/* - * Aligns a given io-virtual address to a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_ALIGN(address, pagesize) \ - ((address) & ~((pagesize) - 1)) -/* - * Creates an IOMMU PTE for an address and a given pagesize - * The PTE has no permission bits set - * Pagesize is expected to be a power-of-two larger than 4096 - */ -#define PAGE_SIZE_PTE(address, pagesize) \ - (((address) | ((pagesize) - 1)) & \ - (~(pagesize >> 1)) & PM_ADDR_MASK) - -/* - * Takes a PTE value with mode=0x07 and returns the page size it maps - */ -#define PTE_PAGE_SIZE(pte) \ - (1ULL << (1 + ffz(((pte) | 0xfffULL)))) - -/* - * Takes a page-table level and returns the default page-size for this level - */ -#define PTE_LEVEL_PAGE_SIZE(level) \ - (1ULL << (12 + (9 * (level)))) - -/* - * The IOPTE dirty bit - */ -#define IOMMU_PTE_HD_BIT (6) - -/* - * Bit value definition for I/O PTE fields - */ -#define IOMMU_PTE_PR BIT_ULL(0) -#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) -#define IOMMU_PTE_U BIT_ULL(59) -#define IOMMU_PTE_FC BIT_ULL(60) -#define IOMMU_PTE_IR BIT_ULL(61) -#define IOMMU_PTE_IW BIT_ULL(62) /* * Bit value 
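The amd_iommu_types.h hunk describes the command-buffer head/tail registers with GENMASK_ULL()/FIELD_GET() rather than open-coded shifts. A small sketch of that idiom; the register below only mirrors the [18:4] pointer field from the hunk, and the FIELD_PREP-based setter is an illustrative addition, not part of the patch:

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

#define DEMO_CMD_PTR_MASK       GENMASK_ULL(18, 4)      /* pointer field [18:4] */

static inline u64 demo_cmd_ptr_get(u64 reg)
{
        return FIELD_GET(DEMO_CMD_PTR_MASK, reg);
}

static inline u64 demo_cmd_ptr_set(u64 reg, u64 ptr)
{
        return (reg & ~DEMO_CMD_PTR_MASK) | FIELD_PREP(DEMO_CMD_PTR_MASK, ptr);
}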
definition for DTE fields @@ -436,12 +371,6 @@ /* DTE[128:179] | DTE[184:191] */ #define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52) -#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) -#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) -#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) -#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) -#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) - #define IOMMU_PROT_MASK 0x03 #define IOMMU_PROT_IR 0x01 #define IOMMU_PROT_IW 0x02 @@ -534,19 +463,6 @@ struct amd_irte_ops; #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0) -#define io_pgtable_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl) - -#define io_pgtable_ops_to_data(x) \ - io_pgtable_to_data(io_pgtable_ops_to_pgtable(x)) - -#define io_pgtable_ops_to_domain(x) \ - container_of(io_pgtable_ops_to_data(x), \ - struct protection_domain, iop) - -#define io_pgtable_cfg_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl.cfg) - struct gcr3_tbl_info { u64 *gcr3_tbl; /* Guest CR3 table */ int glx; /* Number of levels for GCR3 table */ @@ -554,14 +470,6 @@ struct gcr3_tbl_info { u16 domid; /* Per device domain ID */ }; -struct amd_io_pgtable { - seqcount_t seqcount; /* Protects root/mode update */ - struct io_pgtable pgtbl; - int mode; - u64 *root; - u64 *pgd; /* v2 pgtable pgd pointer */ -}; - enum protection_domain_mode { PD_MODE_NONE, PD_MODE_V1, @@ -589,10 +497,13 @@ struct pdom_iommu_info { * independent of their use. */ struct protection_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + struct pt_iommu_x86_64 amdv2; + }; struct list_head dev_list; /* List of all devices in this domain */ - struct iommu_domain domain; /* generic domain handle used by - iommu core code */ - struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ @@ -602,6 +513,9 @@ struct protection_domain { struct mmu_notifier mn; /* mmu notifier for the SVA domain */ struct list_head dev_data_list; /* List of pdom_dev_data */ }; +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain); /* * This structure contains information about one PCI segment in the system. 
diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c index 10fa217a7119..20b04996441d 100644 --- a/drivers/iommu/amd/debugfs.c +++ b/drivers/iommu/amd/debugfs.c @@ -37,7 +37,7 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf, if (ret) return ret; - if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - 4) { + if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) { iommu->dbg_mmio_offset = -1; return -EINVAL; } diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index f2991c11867c..4f4d4955269e 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1710,13 +1710,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id, list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list); if (alloc_dev_table(pci_seg)) - return NULL; + goto err_free_pci_seg; if (alloc_alias_table(pci_seg)) - return NULL; + goto err_free_dev_table; if (alloc_rlookup_table(pci_seg)) - return NULL; + goto err_free_alias_table; return pci_seg; + +err_free_alias_table: + free_alias_table(pci_seg); +err_free_dev_table: + free_dev_table(pci_seg); +err_free_pci_seg: + list_del(&pci_seg->list); + kfree(pci_seg); + return NULL; } static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id, diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c deleted file mode 100644 index 70c2f5b1631b..000000000000 --- a/drivers/iommu/amd/io_pgtable.c +++ /dev/null @@ -1,577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table allocator. - * - * Copyright (C) 2020 Advanced Micro Devices, Inc. - * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/io-pgtable.h> -#include <linux/kernel.h> -#include <linux/sizes.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/dma-mapping.h> -#include <linux/seqlock.h> - -#include <asm/barrier.h> - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -/* - * Helper function to get the first pte of a large mapping - */ -static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, - unsigned long *count) -{ - unsigned long pte_mask, pg_size, cnt; - u64 *fpte; - - pg_size = PTE_PAGE_SIZE(*pte); - cnt = PAGE_SIZE_PTE_COUNT(pg_size); - pte_mask = ~((cnt << 3) - 1); - fpte = (u64 *)(((unsigned long)pte) & pte_mask); - - if (page_size) - *page_size = pg_size; - - if (count) - *count = cnt; - - return fpte; -} - -static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl) -{ - u64 *p; - int i; - - for (i = 0; i < 512; ++i) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - /* Large PTE? */ - if (PM_PTE_LEVEL(pt[i]) == 0 || - PM_PTE_LEVEL(pt[i]) == 7) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. 
- */ - p = IOMMU_PTE_PAGE(pt[i]); - if (lvl > 2) - free_pt_lvl(p, freelist, lvl - 1); - else - iommu_pages_list_add(freelist, p); - } - - iommu_pages_list_add(freelist, pt); -} - -static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist) -{ - switch (mode) { - case PAGE_MODE_NONE: - case PAGE_MODE_7_LEVEL: - break; - case PAGE_MODE_1_LEVEL: - iommu_pages_list_add(freelist, root); - break; - case PAGE_MODE_2_LEVEL: - case PAGE_MODE_3_LEVEL: - case PAGE_MODE_4_LEVEL: - case PAGE_MODE_5_LEVEL: - case PAGE_MODE_6_LEVEL: - free_pt_lvl(root, freelist, mode); - break; - default: - BUG(); - } -} - -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned int page_size_level, - gfp_t gfp) -{ - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - struct protection_domain *domain = - container_of(pgtable, struct protection_domain, iop); - unsigned long flags; - bool ret = true; - u64 *pte; - - pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K); - if (!pte) - return false; - - spin_lock_irqsave(&domain->lock, flags); - - if (address <= PM_LEVEL_SIZE(pgtable->mode) && - pgtable->mode - 1 >= page_size_level) - goto out; - - ret = false; - if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level)) - goto out; - - *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); - - write_seqcount_begin(&pgtable->seqcount); - pgtable->root = pte; - pgtable->mode += 1; - write_seqcount_end(&pgtable->seqcount); - - amd_iommu_update_and_flush_device_table(domain); - - pte = NULL; - ret = true; - -out: - spin_unlock_irqrestore(&domain->lock, flags); - iommu_free_pages(pte); - - return ret; -} - -static u64 *alloc_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long page_size, - u64 **pte_page, - gfp_t gfp, - bool *updated) -{ - unsigned long last_addr = address + (page_size - 1); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned int seqcount; - int level, end_lvl; - u64 *pte, *page; - - BUG_ON(!is_power_of_2(page_size)); - - while (last_addr > PM_LEVEL_SIZE(pgtable->mode) || - pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) { - /* - * Return an error if there is no memory to update the - * page-table. - */ - if (!increase_address_space(pgtable, last_addr, - PAGE_SIZE_LEVEL(page_size), gfp)) - return NULL; - } - - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - - address = PAGE_SIZE_ALIGN(address, page_size); - end_lvl = PAGE_SIZE_LEVEL(page_size); - - while (level > end_lvl) { - u64 __pte, __npte; - int pte_level; - - __pte = *pte; - pte_level = PM_PTE_LEVEL(__pte); - - /* - * If we replace a series of large PTEs, we need - * to tear down all of them. 
- */ - if (IOMMU_PTE_PRESENT(__pte) && - pte_level == PAGE_MODE_7_LEVEL) { - unsigned long count, i; - u64 *lpte; - - lpte = first_pte_l7(pte, NULL, &count); - - /* - * Unmap the replicated PTEs that still match the - * original large mapping - */ - for (i = 0; i < count; ++i) - cmpxchg64(&lpte[i], __pte, 0ULL); - - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE) { - page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, - SZ_4K); - - if (!page) - return NULL; - - __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); - - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - /* No level skipping support yet */ - if (pte_level != level) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(__pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long *page_size) -{ - int level; - unsigned int seqcount; - u64 *pte; - - *page_size = 0; - - if (address > PM_LEVEL_SIZE(pgtable->mode)) - return NULL; - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - *page_size = PTE_LEVEL_PAGE_SIZE(level); - - while (level > 0) { - - /* Not Present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Large PTE */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL || - PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE) - break; - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - /* Walk to the next level */ - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - } - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) - pte = first_pte_l7(pte, page_size, NULL); - - return pte; -} - -static void free_clear_pte(u64 *pte, u64 pteval, - struct iommu_pages_list *freelist) -{ - u64 *pt; - int mode; - - while (!try_cmpxchg64(pte, &pteval, 0)) - pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); - - if (!IOMMU_PTE_PRESENT(pteval)) - return; - - pt = IOMMU_PTE_PAGE(pteval); - mode = IOMMU_PTE_MODE(pteval); - - free_sub_pt(pt, mode, freelist); -} - -/* - * Generic mapping functions. It maps a physical address into a DMA - * address space. It allocates the page table pages if necessary. - * In the future it can be extended to a generic mapping function - * supporting all features of AMD IOMMU page tables like level skipping - * and full 64 bit address spaces. 
- */ -static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - bool updated = false; - u64 __pte, *pte; - int ret, i, count; - size_t size = pgcount << __ffs(pgsize); - unsigned long o_iova = iova; - - BUG_ON(!IS_ALIGNED(iova, pgsize)); - BUG_ON(!IS_ALIGNED(paddr, pgsize)); - - ret = -EINVAL; - if (!(prot & IOMMU_PROT_MASK)) - goto out; - - while (pgcount > 0) { - count = PAGE_SIZE_PTE_COUNT(pgsize); - pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated); - - ret = -ENOMEM; - if (!pte) - goto out; - - for (i = 0; i < count; ++i) - free_clear_pte(&pte[i], pte[i], &freelist); - - if (!iommu_pages_list_empty(&freelist)) - updated = true; - - if (count > 1) { - __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize); - __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; - } else - __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; - - if (prot & IOMMU_PROT_IR) - __pte |= IOMMU_PTE_IR; - if (prot & IOMMU_PROT_IW) - __pte |= IOMMU_PTE_IW; - - for (i = 0; i < count; ++i) - pte[i] = __pte; - - iova += pgsize; - paddr += pgsize; - pgcount--; - if (mapped) - *mapped += pgsize; - } - - ret = 0; - -out: - if (updated) { - struct protection_domain *dom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - /* - * Flush domain TLB(s) and wait for completion. Any Device-Table - * Updates and flushing already happened in - * increase_address_space(). - */ - amd_iommu_domain_flush_pages(dom, o_iova, size); - spin_unlock_irqrestore(&dom->lock, flags); - } - - /* Everything flushed out, free pages now */ - iommu_put_pages_list(&freelist); - - return ret; -} - -static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long long unmapped; - unsigned long unmap_size; - u64 *pte; - size_t size = pgcount << __ffs(pgsize); - - BUG_ON(!is_power_of_2(pgsize)); - - unmapped = 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (pte) { - int i, count; - - count = PAGE_SIZE_PTE_COUNT(unmap_size); - for (i = 0; i < count; i++) - pte[i] = 0ULL; - } else { - return unmapped; - } - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, - unsigned long flags) -{ - bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; - bool dirty = false; - int i, count; - - /* - * 2.2.3.2 Host Dirty Support - * When a non-default page size is used , software must OR the - * Dirty bits in all of the replicated host PTEs used to map - * the page. The IOMMU does not guarantee the Dirty bits are - * set in all of the replicated PTEs. 
Any portion of the page - * may have been written even if the Dirty bit is set in only - * one of the replicated PTEs. - */ - count = PAGE_SIZE_PTE_COUNT(size); - for (i = 0; i < count && test_only; i++) { - if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { - dirty = true; - break; - } - } - - for (i = 0; i < count && !test_only; i++) { - if (test_and_clear_bit(IOMMU_PTE_HD_BIT, - (unsigned long *)&ptep[i])) { - dirty = true; - } - } - - return dirty; -} - -static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long end = iova + size - 1; - - do { - unsigned long pgsize = 0; - u64 *ptep, pte; - - ptep = fetch_pte(pgtable, iova, &pgsize); - if (ptep) - pte = READ_ONCE(*ptep); - if (!ptep || !IOMMU_PTE_PRESENT(pte)) { - pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); - iova += pgsize; - continue; - } - - /* - * Mark the whole IOVA range as dirty even if only one of - * the replicated PTEs were marked dirty. - */ - if (pte_test_and_clear_dirty(ptep, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -/* - * ---------------------------------------------------- - */ -static void v1_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - - if (pgtable->mode == PAGE_MODE_NONE) - return; - - /* Page-table is not visible to IOMMU anymore, so free it */ - BUG_ON(pgtable->mode < PAGE_MODE_NONE || - pgtable->mode > amd_iommu_hpt_level); - - free_sub_pt(pgtable->root, pgtable->mode, &freelist); - iommu_put_pages_list(&freelist); -} - -static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - - pgtable->root = - iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->root) - return NULL; - pgtable->mode = PAGE_MODE_3_LEVEL; - seqcount_init(&pgtable->seqcount); - - cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; - cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys; - pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { - .alloc = v1_alloc_pgtable, - .free = v1_free_pgtable, -}; diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c deleted file mode 100644 index b47941353ccb..000000000000 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ /dev/null @@ -1,370 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table v2 allocator. - * - * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc. 
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - * Author: Vasant Hegde <vasant.hegde@amd.com> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include <linux/bitops.h> -#include <linux/io-pgtable.h> -#include <linux/kernel.h> - -#include <asm/barrier.h> - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */ -#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */ -#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */ -#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */ -#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */ -#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */ -#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */ -#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */ -#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */ - -#define MAX_PTRS_PER_PAGE 512 - -#define IOMMU_PAGE_SIZE_2M BIT_ULL(21) -#define IOMMU_PAGE_SIZE_1G BIT_ULL(30) - - -static inline int get_pgtable_level(void) -{ - return amd_iommu_gpt_level; -} - -static inline bool is_large_pte(u64 pte) -{ - return (pte & IOMMU_PAGE_PSE); -} - -static inline u64 set_pgtable_attr(u64 *page) -{ - u64 prot; - - prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER; - prot |= IOMMU_PAGE_ACCESS; - - return (iommu_virt_to_phys(page) | prot); -} - -static inline void *get_pgtable_pte(u64 pte) -{ - return iommu_phys_to_virt(pte & PM_ADDR_MASK); -} - -static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot) -{ - u64 pte; - - pte = __sme_set(paddr & PM_ADDR_MASK); - pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER; - pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; - - if (prot & IOMMU_PROT_IW) - pte |= IOMMU_PAGE_RW; - - /* Large page */ - if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M) - pte |= IOMMU_PAGE_PSE; - - return pte; -} - -static inline u64 get_alloc_page_size(u64 size) -{ - if (size >= IOMMU_PAGE_SIZE_1G) - return IOMMU_PAGE_SIZE_1G; - - if (size >= IOMMU_PAGE_SIZE_2M) - return IOMMU_PAGE_SIZE_2M; - - return PAGE_SIZE; -} - -static inline int page_size_to_level(u64 pg_size) -{ - if (pg_size == IOMMU_PAGE_SIZE_1G) - return PAGE_MODE_3_LEVEL; - if (pg_size == IOMMU_PAGE_SIZE_2M) - return PAGE_MODE_2_LEVEL; - - return PAGE_MODE_1_LEVEL; -} - -static void free_pgtable(u64 *pt, int level) -{ - u64 *p; - int i; - - for (i = 0; i < MAX_PTRS_PER_PAGE; i++) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - if (is_large_pte(pt[i])) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. 
- */ - p = get_pgtable_pte(pt[i]); - if (level > 2) - free_pgtable(p, level - 1); - else - iommu_free_pages(p); - } - - iommu_free_pages(pt); -} - -/* Allocate page table */ -static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, - unsigned long pg_size, gfp_t gfp, bool *updated) -{ - u64 *pte, *page; - int level, end_level; - - level = get_pgtable_level() - 1; - end_level = page_size_to_level(pg_size); - pte = &pgd[PM_LEVEL_INDEX(level, iova)]; - iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE); - - while (level >= end_level) { - u64 __pte, __npte; - - __pte = *pte; - - if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) { - /* Unmap large pte */ - cmpxchg64(pte, *pte, 0ULL); - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte)) { - page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K); - if (!page) - return NULL; - - __npte = set_pgtable_attr(page); - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - level -= 1; - pte = get_pgtable_pte(__pte); - pte = &pte[PM_LEVEL_INDEX(level, iova)]; - } - - /* Tear down existing pte entries */ - if (IOMMU_PTE_PRESENT(*pte)) { - u64 *__pte; - - *updated = true; - __pte = get_pgtable_pte(*pte); - cmpxchg64(pte, *pte, 0ULL); - if (pg_size == IOMMU_PAGE_SIZE_1G) - free_pgtable(__pte, end_level - 1); - else if (pg_size == IOMMU_PAGE_SIZE_2M) - iommu_free_pages(__pte); - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. - * If there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long iova, unsigned long *page_size) -{ - u64 *pte; - int level; - - level = get_pgtable_level() - 1; - pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)]; - /* Default page size is 4K */ - *page_size = PAGE_SIZE; - - while (level) { - /* Not present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Walk to the next level */ - pte = get_pgtable_pte(*pte); - pte = &pte[PM_LEVEL_INDEX(level - 1, iova)]; - - /* Large page */ - if (is_large_pte(*pte)) { - if (level == PAGE_MODE_3_LEVEL) - *page_size = IOMMU_PAGE_SIZE_1G; - else if (level == PAGE_MODE_2_LEVEL) - *page_size = IOMMU_PAGE_SIZE_2M; - else - return NULL; /* Wrongly set PSE bit in PTE */ - - break; - } - - level -= 1; - } - - return pte; -} - -static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - u64 *pte; - unsigned long map_size; - unsigned long mapped_size = 0; - unsigned long o_iova = iova; - size_t size = pgcount << __ffs(pgsize); - int ret = 0; - bool updated = false; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount) - return -EINVAL; - - if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; - - while (mapped_size < size) { - map_size = get_alloc_page_size(pgsize); - pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd, - iova, map_size, gfp, &updated); - if (!pte) { - ret = -ENOMEM; - goto out; - } - - *pte = set_pte_attr(paddr, map_size, prot); - - iova += map_size; - paddr += map_size; - mapped_size += map_size; - } - -out: - if (updated) { - struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&pdom->lock, flags); - amd_iommu_domain_flush_pages(pdom, 
o_iova, size); - spin_unlock_irqrestore(&pdom->lock, flags); - } - - if (mapped) - *mapped += mapped_size; - - return ret; -} - -static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned long unmap_size; - unsigned long unmapped = 0; - size_t size = pgcount << __ffs(pgsize); - u64 *pte; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount)) - return 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (!pte) - return unmapped; - - *pte = 0ULL; - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -/* - * ---------------------------------------------------- - */ -static void v2_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - - if (!pgtable || !pgtable->pgd) - return; - - /* Free page table */ - free_pgtable(pgtable->pgd, get_pgtable_level()); - pgtable->pgd = NULL; -} - -static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - int ias = IOMMU_IN_ADDR_BIT_SIZE; - - pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->pgd) - return NULL; - - if (get_pgtable_level() == PAGE_MODE_5_LEVEL) - ias = 57; - - pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys; - - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - cfg->ias = ias; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { - .alloc = v2_alloc_pgtable, - .free = v2_free_pgtable, -}; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2e1865daa1ce..9f1d56a5e145 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -30,7 +30,6 @@ #include <linux/msi.h> #include <linux/irqdomain.h> #include <linux/percpu.h> -#include <linux/io-pgtable.h> #include <linux/cc_platform.h> #include <asm/irq_remapping.h> #include <asm/io_apic.h> @@ -41,9 +40,9 @@ #include <asm/gart.h> #include <asm/dma.h> #include <uapi/linux/iommufd.h> +#include <linux/generic_pt/iommu.h> #include "amd_iommu.h" -#include "../dma-iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" @@ -60,7 +59,6 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; -static const struct iommu_dirty_ops amd_dirty_ops; int amd_iommu_max_glx_val = -1; @@ -70,15 +68,22 @@ int amd_iommu_max_glx_val = -1; */ DEFINE_IDA(pdom_ids); -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev); +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old); static void 
set_dte_entry(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data); + struct iommu_dev_data *dev_data, + phys_addr_t top_paddr, unsigned int top_level); + +static void amd_iommu_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level); static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); +static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); /**************************************************************************** * @@ -1157,6 +1162,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static void dump_command_buffer(struct amd_iommu *iommu) +{ + struct iommu_cmd *cmd; + u32 head, tail; + int i; + + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head), + MMIO_CMD_BUFFER_TAIL(tail)); + + for (i = 0; i < CMD_BUFFER_ENTRIES; i++) { + cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd)); + pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2], + cmd->data[3]); + } +} + static int wait_on_sem(struct amd_iommu *iommu, u64 data) { int i = 0; @@ -1167,7 +1191,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data) } if (i == LOOP_TIMEOUT) { - pr_alert("Completion-Wait loop timed out\n"); + + pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n", + iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid), + PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid)); + + if (amd_iommu_dump) + DO_ONCE_LITE(dump_command_buffer, iommu); + return -EIO; } @@ -1756,42 +1787,6 @@ static void dev_flush_pasid_all(struct iommu_dev_data *dev_data, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } -/* Flush the not present cache if it exists */ -static void domain_flush_np_cache(struct protection_domain *domain, - dma_addr_t iova, size_t size) -{ - if (unlikely(amd_iommu_np_cache)) { - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - amd_iommu_domain_flush_pages(domain, iova, size); - spin_unlock_irqrestore(&domain->lock, flags); - } -} - - -/* - * This function flushes the DTEs for all devices in domain - */ -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - - lockdep_assert_held(&domain->lock); - - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); - - set_dte_entry(iommu, dev_data); - clone_aliases(iommu, dev_data->dev); - } - - list_for_each_entry(dev_data, &domain->dev_list, list) - device_flush_dte(dev_data); - - domain_flush_complete(domain); -} - int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) { struct iommu_dev_data *dev_data; @@ -2051,7 +2046,8 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu, } static void set_dte_entry(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data) + struct iommu_dev_data *dev_data, + phys_addr_t top_paddr, unsigned int top_level) { u16 domid; u32 old_domid; @@ -2060,19 +2056,36 @@ static void set_dte_entry(struct amd_iommu *iommu, struct protection_domain *domain = dev_data->domain; struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; struct dev_table_entry *dte = 
&get_dev_table(iommu)[dev_data->devid]; + struct pt_iommu_amdv1_hw_info pt_info; + + make_clear_dte(dev_data, dte, &new); if (gcr3_info && gcr3_info->gcr3_tbl) domid = dev_data->gcr3_info.domid; - else + else { domid = domain->id; - make_clear_dte(dev_data, dte, &new); - - if (domain->iop.mode != PAGE_MODE_NONE) - new.data[0] |= iommu_virt_to_phys(domain->iop.root); + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) { + /* + * When updating the IO pagetable, the new top and level + * are provided as parameters. For other operations i.e. + * device attach, retrieve the current pagetable info + * via the IOMMU PT API. + */ + if (top_paddr) { + pt_info.host_pt_root = top_paddr; + pt_info.mode = top_level + 1; + } else { + WARN_ON(top_paddr || top_level); + pt_iommu_amdv1_hw_info(&domain->amdv1, + &pt_info); + } - new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; + new.data[0] |= __sme_set(pt_info.host_pt_root) | + (pt_info.mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + } + } new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; @@ -2138,7 +2151,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); if (set) - set_dte_entry(iommu, dev_data); + set_dte_entry(iommu, dev_data, 0, 0); else clear_dte_entry(iommu, dev_data); @@ -2156,6 +2169,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data, { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); int max_pasids = dev_data->max_pasids; + struct pt_iommu_x86_64_hw_info pt_info; int ret = 0; /* @@ -2178,7 +2192,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data, if (!pdom_is_v2_pgtbl_mode(pdom)) return ret; - ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true); + pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info); + ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true); if (ret) free_gcr3_table(&dev_data->gcr3_info); @@ -2500,94 +2515,240 @@ struct protection_domain *protection_domain_alloc(void) return domain; } -static int pdom_setup_pgtable(struct protection_domain *domain, - struct device *dev) +static bool amd_iommu_hd_support(struct amd_iommu *iommu) +{ + if (amd_iommu_hatdis) + return false; + + return iommu && (iommu->features & FEATURE_HDSUP); +} + +static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) { - struct io_pgtable_ops *pgtbl_ops; - enum io_pgtable_fmt fmt; + struct protection_domain *pdom = + container_of(iommupt, struct protection_domain, iommu); - switch (domain->pd_mode) { - case PD_MODE_V1: - fmt = AMD_IOMMU_V1; - break; - case PD_MODE_V2: - fmt = AMD_IOMMU_V2; - break; - case PD_MODE_NONE: - WARN_ON_ONCE(1); - return -EPERM; + return &pdom->lock; +} + +/* + * Update all HW references to the domain with a new pgtable configuration. 
+ */ +static void amd_iommu_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level) +{ + struct protection_domain *pdom = + container_of(iommu_table, struct protection_domain, iommu); + struct iommu_dev_data *dev_data; + + lockdep_assert_held(&pdom->lock); + + /* Update the DTE for all devices attached to this domain */ + list_for_each_entry(dev_data, &pdom->dev_list, list) { + struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); + + /* Update the HW references with the new level and top ptr */ + set_dte_entry(iommu, dev_data, top_paddr, top_level); + clone_aliases(iommu, dev_data->dev); } - domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev); - pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain); - if (!pgtbl_ops) - return -ENOMEM; + list_for_each_entry(dev_data, &pdom->dev_list, list) + device_flush_dte(dev_data); + + domain_flush_complete(pdom); +} + +/* + * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to + * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non + * present caching (like hypervisor shadowing). + */ +static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, + unsigned long iova, size_t size) +{ + struct protection_domain *domain = to_pdomain(dom); + unsigned long flags; + if (likely(!amd_iommu_np_cache)) + return 0; + + spin_lock_irqsave(&domain->lock, flags); + amd_iommu_domain_flush_pages(domain, iova, size); + spin_unlock_irqrestore(&domain->lock, flags); return 0; } -static inline u64 dma_max_address(enum protection_domain_mode pgtable) +static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { - if (pgtable == PD_MODE_V1) - return PM_LEVEL_SIZE(amd_iommu_hpt_level); + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; - /* - * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page - * Translation" shows that the V2 table sign extends the top of the - * address space creating a reserved region in the middle of the - * translation, just like the CPU does. Further Vasant says the docs are - * incomplete and this only applies to non-zero PASIDs. If the AMDv2 - * page table is assigned to the 0 PASID then there is no sign extension - * check. - * - * Since the IOMMU must have a fixed geometry, and the core code does - * not understand sign extended addressing, we have to chop off the high - * bit to get consistent behavior with attachments of the domain to any - * PASID. 
- */ - return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1); + spin_lock_irqsave(&dom->lock, flags); + amd_iommu_domain_flush_all(dom); + spin_unlock_irqrestore(&dom->lock, flags); } -static bool amd_iommu_hd_support(struct amd_iommu *iommu) +static void amd_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - if (amd_iommu_hatdis) - return false; + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; - return iommu && (iommu->features & FEATURE_HDSUP); + spin_lock_irqsave(&dom->lock, flags); + amd_iommu_domain_flush_pages(dom, gather->start, + gather->end - gather->start + 1); + spin_unlock_irqrestore(&dom->lock, flags); + iommu_put_pages_list(&gather->freelist); } -static struct iommu_domain * -do_iommu_domain_alloc(struct device *dev, u32 flags, - enum protection_domain_mode pgtable) +static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { + .get_top_lock = amd_iommu_get_top_lock, + .change_top = amd_iommu_change_top, +}; + +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .iotlb_sync_map = amd_iommu_iotlb_sync_map, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, + .attach_dev = amd_iommu_attach_device, + .free = amd_iommu_domain_free, + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, +}; + +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = amd_iommu_set_dirty_tracking, +}; + +static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev, + u32 flags) { - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); + struct pt_iommu_amdv1_cfg cfg = {}; struct protection_domain *domain; int ret; + if (amd_iommu_hatdis) + return ERR_PTR(-EOPNOTSUPP); + domain = protection_domain_alloc(); if (!domain) return ERR_PTR(-ENOMEM); - domain->pd_mode = pgtable; - ret = pdom_setup_pgtable(domain, dev); + domain->pd_mode = PD_MODE_V1; + domain->iommu.driver_ops = &amd_hw_driver_ops_v1; + domain->iommu.nid = dev_to_node(dev); + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + domain->domain.dirty_ops = &amdv1_dirty_ops; + + /* + * Someday FORCE_COHERENCE should be set by + * amd_iommu_enforce_cache_coherency() like VT-d does. + */ + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); + + /* + * AMD's IOMMU can flush as many pages as necessary in a single flush. + * Unless we run in a virtual machine, which can be inferred according + * to whether "non-present cache" is on, it is probably best to prefer + * (potentially) too extensive TLB flushing (i.e., more misses) over + * multiple TLB flushes (i.e., more flushes). For virtual machines the + * hypervisor needs to synchronize the host IOMMU PTEs with those of + * the guest, and the trade-off is different: unnecessary TLB flushes + * should be avoided. 
+ */ + if (amd_iommu_np_cache) + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); + else + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); + + cfg.common.hw_max_vasz_lg2 = + min(64, (amd_iommu_hpt_level - 1) * 9 + 21); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.starting_level = 2; + domain->domain.ops = &amdv1_ops; + + ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); if (ret) { - pdom_id_free(domain->id); - kfree(domain); + amd_iommu_domain_free(&domain->domain); return ERR_PTR(ret); } - domain->domain.geometry.aperture_start = 0; - domain->domain.geometry.aperture_end = dma_max_address(pgtable); - domain->domain.geometry.force_aperture = true; - domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; + /* + * Narrow the supported page sizes to those selected by the kernel + * command line. + */ + domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; + return &domain->domain; +} - domain->domain.type = IOMMU_DOMAIN_UNMANAGED; - domain->domain.ops = iommu->iommu.ops->default_domain_ops; +static const struct iommu_domain_ops amdv2_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), + .iotlb_sync_map = amd_iommu_iotlb_sync_map, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, + .attach_dev = amd_iommu_attach_device, + .free = amd_iommu_domain_free, + /* + * Note the AMDv2 page table format does not support a Force Coherency + * bit, so enforce_cache_coherency should not be set. However VFIO is + * not prepared to handle a case where some domains will support + * enforcement and others do not. VFIO and iommufd will have to be fixed + * before it can fully use the V2 page table. See the comment in + * iommufd_hwpt_paging_alloc(). For now leave things as they have + * historically been and lie about enforce_cache_coherency. + */ + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, +}; - if (dirty_tracking) - domain->domain.dirty_ops = &amd_dirty_ops; +static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, + u32 flags) +{ + struct pt_iommu_x86_64_cfg cfg = {}; + struct protection_domain *domain; + int ret; + if (!amd_iommu_v2_pgtbl_supported()) + return ERR_PTR(-EOPNOTSUPP); + + domain = protection_domain_alloc(); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->pd_mode = PD_MODE_V2; + domain->iommu.nid = dev_to_node(dev); + + cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); + if (amd_iommu_np_cache) + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); + else + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); + + /* + * The v2 table behaves differently if it is attached to PASID 0 vs a + * non-zero PASID. On PASID 0 it has no sign extension and the full + * 57/48 bits decode the lower addresses. Otherwise it behaves like a + * normal sign extended x86 page table. Since we want the domain to work + * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not + * set which creates a table that is compatible in both modes. 
+ */ + if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { + cfg.common.hw_max_vasz_lg2 = 56; + cfg.top_level = 4; + } else { + cfg.common.hw_max_vasz_lg2 = 47; + cfg.top_level = 3; + } + cfg.common.hw_max_oasz_lg2 = 52; + domain->domain.ops = &amdv2_ops; + + ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); + if (ret) { + amd_iommu_domain_free(&domain->domain); + return ERR_PTR(ret); + } return &domain->domain; } @@ -2608,15 +2769,27 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, /* Allocate domain with v1 page table for dirty tracking */ if (!amd_iommu_hd_support(iommu)) break; - return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); + return amd_iommu_domain_alloc_paging_v1(dev, flags); case IOMMU_HWPT_ALLOC_PASID: /* Allocate domain with v2 page table if IOMMU supports PASID. */ if (!amd_iommu_pasid_supported()) break; - return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); - case 0: + return amd_iommu_domain_alloc_paging_v2(dev, flags); + case 0: { + struct iommu_domain *ret; + /* If nothing specific is required use the kernel commandline default */ - return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); + if (amd_iommu_pgtable == PD_MODE_V1) { + ret = amd_iommu_domain_alloc_paging_v1(dev, flags); + if (ret != ERR_PTR(-EOPNOTSUPP)) + return ret; + return amd_iommu_domain_alloc_paging_v2(dev, flags); + } + ret = amd_iommu_domain_alloc_paging_v2(dev, flags); + if (ret != ERR_PTR(-EOPNOTSUPP)) + return ret; + return amd_iommu_domain_alloc_paging_v1(dev, flags); + } default: break; } @@ -2628,14 +2801,14 @@ void amd_iommu_domain_free(struct iommu_domain *dom) struct protection_domain *domain = to_pdomain(dom); WARN_ON(!list_empty(&domain->dev_list)); - if (domain->domain.type & __IOMMU_DOMAIN_PAGING) - free_io_pgtable_ops(&domain->iop.pgtbl.ops); + pt_iommu_deinit(&domain->iommu); pdom_id_free(domain->id); kfree(domain); } static int blocked_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); @@ -2685,16 +2858,8 @@ void amd_iommu_init_identity_domain(void) protection_domain_init(&identity_domain); } -/* Same as blocked domain except it supports only ops->attach_dev() */ -static struct iommu_domain release_domain = { - .type = IOMMU_DOMAIN_BLOCKED, - .ops = &(const struct iommu_domain_ops) { - .attach_dev = blocked_domain_attach_device, - } -}; - -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev) +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); @@ -2734,93 +2899,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return ret; } -static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, - unsigned long iova, size_t size) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - - if (ops->map_pages) - domain_flush_np_cache(domain, iova, size); - return 0; -} - -static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int iommu_prot, gfp_t gfp, size_t *mapped) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - int prot = 0; - int ret = -EINVAL; - - if ((domain->pd_mode == PD_MODE_V1) && - (domain->iop.mode == 
PAGE_MODE_NONE)) - return -EINVAL; - - if (iommu_prot & IOMMU_READ) - prot |= IOMMU_PROT_IR; - if (iommu_prot & IOMMU_WRITE) - prot |= IOMMU_PROT_IW; - - if (ops->map_pages) { - ret = ops->map_pages(ops, iova, paddr, pgsize, - pgcount, prot, gfp, mapped); - } - - return ret; -} - -static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size) -{ - /* - * AMD's IOMMU can flush as many pages as necessary in a single flush. - * Unless we run in a virtual machine, which can be inferred according - * to whether "non-present cache" is on, it is probably best to prefer - * (potentially) too extensive TLB flushing (i.e., more misses) over - * mutliple TLB flushes (i.e., more flushes). For virtual machines the - * hypervisor needs to synchronize the host IOMMU PTEs with those of - * the guest, and the trade-off is different: unnecessary TLB flushes - * should be avoided. - */ - if (amd_iommu_np_cache && - iommu_iotlb_gather_is_disjoint(gather, iova, size)) - iommu_iotlb_sync(domain, gather); - - iommu_iotlb_gather_add_range(gather, iova, size); -} - -static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - size_t r; - - if ((domain->pd_mode == PD_MODE_V1) && - (domain->iop.mode == PAGE_MODE_NONE)) - return 0; - - r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0; - - if (r) - amd_iommu_iotlb_gather_add_page(dom, gather, iova, r); - - return r; -} - -static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, - dma_addr_t iova) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - - return ops->iova_to_phys(ops, iova); -} - static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) { switch (cap) { @@ -2887,28 +2965,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct protection_domain *pdomain = to_pdomain(domain); - struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops; - unsigned long lflags; - - if (!ops || !ops->read_and_clear_dirty) - return -EOPNOTSUPP; - - spin_lock_irqsave(&pdomain->lock, lflags); - if (!pdomain->dirty_tracking && dirty->bitmap) { - spin_unlock_irqrestore(&pdomain->lock, lflags); - return -EINVAL; - } - spin_unlock_irqrestore(&pdomain->lock, lflags); - - return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); -} - static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2978,28 +3034,6 @@ static bool amd_iommu_is_attach_deferred(struct device *dev) return dev_data->defer_attach; } -static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) -{ - struct protection_domain *dom = to_pdomain(domain); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_all(dom); - spin_unlock_irqrestore(&dom->lock, flags); -} - -static void amd_iommu_iotlb_sync(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather) -{ - struct protection_domain *dom = to_pdomain(domain); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_pages(dom, gather->start, - gather->end - 
gather->start + 1); - spin_unlock_irqrestore(&dom->lock, flags); -} - static int amd_iommu_def_domain_type(struct device *dev) { struct iommu_dev_data *dev_data; @@ -3034,15 +3068,10 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } -static const struct iommu_dirty_ops amd_dirty_ops = { - .set_dirty_tracking = amd_iommu_set_dirty_tracking, - .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, -}; - const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .blocked_domain = &blocked_domain, - .release_domain = &release_domain, + .release_domain = &blocked_domain, .identity_domain = &identity_domain.domain, .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, .domain_alloc_sva = amd_iommu_domain_alloc_sva, @@ -3053,17 +3082,6 @@ const struct iommu_ops amd_iommu_ops = { .is_attach_deferred = amd_iommu_is_attach_deferred, .def_domain_type = amd_iommu_def_domain_type, .page_response = amd_iommu_page_response, - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = amd_iommu_attach_device, - .map_pages = amd_iommu_map_pages, - .unmap_pages = amd_iommu_unmap_pages, - .iotlb_sync_map = amd_iommu_iotlb_sync_map, - .iova_to_phys = amd_iommu_iova_to_phys, - .flush_iotlb_all = amd_iommu_flush_iotlb_all, - .iotlb_sync = amd_iommu_iotlb_sync, - .free = amd_iommu_domain_free, - .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, - } }; #ifdef CONFIG_IRQ_REMAP @@ -3354,7 +3372,7 @@ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, struct irte_ga *irte) { - bool ret; + int ret; ret = __modify_irte_ga(iommu, devid, index, irte); if (ret) @@ -4072,3 +4090,5 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) return 0; } #endif + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 95a4e62b8f63..83a5aabcd15d 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -672,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain, } static int apple_dart_attach_dev_paging(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret, i; struct apple_dart_stream_map *stream_map; @@ -693,7 +694,8 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain, } static int apple_dart_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; @@ -717,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = { }; static int apple_dart_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; @@ -802,6 +805,8 @@ static int apple_dart_of_xlate(struct device *dev, struct apple_dart *cfg_dart; int i, sid; + put_device(&iommu_pdev->dev); + if (args->args_count != 1) return -EINVAL; sid = args->args[0]; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 8cd8929bbfdf..93fdadd07431 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -99,6 +99,8 @@ static void 
arm_smmu_make_nested_domain_ste( int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, struct arm_smmu_nested_domain *nested_domain) { + unsigned int cfg = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])); struct arm_smmu_vmaster *vmaster; unsigned long vsid; int ret; @@ -107,8 +109,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core, state->master->dev, &vsid); - if (ret) + /* + * Attaching to a translate nested domain must allocate a vDEVICE prior, + * as CD/ATS invalidations and vevents require a vSID to work properly. + * A abort/bypass domain is allowed to attach w/o vmaster for GBPA case. + */ + if (ret) { + if (cfg == STRTAB_STE_0_CFG_ABORT || + cfg == STRTAB_STE_0_CFG_BYPASS) + return 0; return ret; + } vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL); if (!vmaster) @@ -138,14 +149,15 @@ void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) } static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_nested_domain *nested_domain = to_smmu_nested_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { .master = master, - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; struct arm_smmu_ste ste; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2a8b46b948f0..d16d35c78c06 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1464,7 +1464,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma, GFP_KERNEL); - if (!cd_table->l2.l2ptrs) { + if (!cd_table->l2.l1tab) { ret = -ENOMEM; goto err_free_l2ptrs; } @@ -3002,7 +3002,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) master->ats_enabled = state->ats_enabled; } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old_domain) { int ret = 0; struct arm_smmu_ste target; @@ -3010,7 +3011,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_attach_state state = { - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; struct arm_smmu_master *master; @@ -3186,7 +3187,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, /* * When the last user of the CD table goes away downgrade the STE back - * to a non-cd_table one. + * to a non-cd_table one, by re-attaching its sid_domain. 
*/ if (!arm_smmu_ssids_in_use(&master->cd_table)) { struct iommu_domain *sid_domain = @@ -3194,12 +3195,14 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || sid_domain->type == IOMMU_DOMAIN_BLOCKED) - sid_domain->ops->attach_dev(sid_domain, dev); + sid_domain->ops->attach_dev(sid_domain, dev, + sid_domain); } return 0; } static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, + struct iommu_domain *old_domain, struct device *dev, struct arm_smmu_ste *ste, unsigned int s1dss) @@ -3207,7 +3210,7 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { .master = master, - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; @@ -3248,14 +3251,16 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_master_clear_vmaster(master); arm_smmu_make_bypass_ste(master->smmu, &ste); - arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, + STRTAB_STE_1_S1DSS_BYPASS); return 0; } @@ -3269,14 +3274,15 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_master_clear_vmaster(master); arm_smmu_make_abort_ste(&ste); - arm_smmu_attach_dev_ste(domain, dev, &ste, + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, STRTAB_STE_1_S1DSS_TERMINATE); return 0; } @@ -3582,12 +3588,6 @@ static void arm_smmu_release_device(struct device *dev) WARN_ON(master->iopf_refcount); - /* Put the STE back to what arm_smmu_init_strtab() sets */ - if (dev->iommu->require_direct) - arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); - else - arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); - arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); if (arm_smmu_cdtab_allocated(&master->cd_table)) @@ -3678,6 +3678,7 @@ static int arm_smmu_def_domain_type(struct device *dev) static const struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, + .release_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .hw_info = arm_smmu_hw_info, .domain_alloc_sva = arm_smmu_sva_domain_alloc, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 57c097e87613..573085349df3 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -367,6 +367,7 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,adreno" }, { .compatible = "qcom,adreno-gmu" }, + { .compatible = "qcom,glymur-mdss" }, { .compatible = "qcom,mdp4" }, { .compatible = "qcom,mdss" }, { .compatible = "qcom,qcm2290-mdss" }, @@ -431,17 +432,19 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) /* * Some platforms support more than the 
Arm SMMU architected maximum of - * 128 stream matching groups. For unknown reasons, the additional - * groups don't exhibit the same behavior as the architected registers, - * so limit the groups to 128 until the behavior is fixed for the other - * groups. + * 128 stream matching groups. The additional registers appear to have + * the same behavior as the architected registers in the hardware. + * However, on some firmware versions, the hypervisor does not + * correctly trap and emulate accesses to the additional registers, + * resulting in unexpected behavior. + * + * If there are more than 128 groups, use the last reliable group to + * detect if we need to apply the bypass quirk. */ - if (smmu->num_mapping_groups > 128) { - dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); - smmu->num_mapping_groups = 128; - } - - last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); + if (smmu->num_mapping_groups > 128) + last_s2cr = ARM_SMMU_GR0_S2CR(127); + else + last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); /* * With some firmware versions writes to S2CR of type FAULT are @@ -464,6 +467,11 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS); arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg); + + if (smmu->num_mapping_groups > 128) { + dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); + smmu->num_mapping_groups = 128; + } } for (i = 0; i < smmu->num_mapping_groups; i++) { diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 4ced4b5bee4d..5e690cf85ec9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1165,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg, } } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -1234,7 +1235,8 @@ static int arm_smmu_attach_dev_type(struct device *dev, } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS); } @@ -1249,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT); } diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index c5be95e56031..f69d9276dc55 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -359,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) kfree(qcom_domain); } -static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int qcom_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); @@ -388,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev } static int qcom_iommu_identity_attach(struct 
iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct qcom_iommu_domain *qcom_domain; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); unsigned int i; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - qcom_domain = to_qcom_iommu_domain(domain); + qcom_domain = to_qcom_iommu_domain(old); if (WARN_ON(!qcom_domain->iommu)) return -EINVAL; @@ -565,14 +566,14 @@ static int qcom_iommu_of_xlate(struct device *dev, qcom_iommu = platform_get_drvdata(iommu_pdev); + put_device(&iommu_pdev->dev); + /* make sure the asid specified in dt is valid, so we don't have * to sanity check this elsewhere: */ if (WARN_ON(asid > qcom_iommu->max_asid) || - WARN_ON(qcom_iommu->ctxs[asid] == NULL)) { - put_device(&iommu_pdev->dev); + WARN_ON(qcom_iommu->ctxs[asid] == NULL)) return -EINVAL; - } if (!dev_iommu_priv_get(dev)) { dev_iommu_priv_set(dev, qcom_iommu); @@ -581,10 +582,8 @@ static int qcom_iommu_of_xlate(struct device *dev, * multiple different iommu devices. Multiple context * banks are ok, but multiple devices are not: */ - if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) { - put_device(&iommu_pdev->dev); + if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) return -EINVAL; - } } return iommu_fwspec_add_ids(dev, &asid, 1); diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f1fb27681b0b..c92088855450 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, * as a bus address, __finalise_sg() will copy the dma * address into the output segment. 
*/ - s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(s)); + s->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(s)); sg_dma_len(s) = sg->length; sg_dma_mark_bus_address(s); continue; diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index b6edd178fe25..b512c6b939ac 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -984,7 +984,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) } static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct exynos_iommu_domain *domain; @@ -1035,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = { }; static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); @@ -1044,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, unsigned long flags; int err; - err = exynos_iommu_identity_attach(&exynos_identity_domain, dev); + err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old); if (err) return err; @@ -1429,8 +1431,6 @@ static void exynos_iommu_release_device(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct sysmmu_drvdata *data; - WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev)); - list_for_each_entry(data, &owner->controllers, owner_node) device_link_del(data->link); } @@ -1446,17 +1446,14 @@ static int exynos_iommu_of_xlate(struct device *dev, return -ENODEV; data = platform_get_drvdata(sysmmu); - if (!data) { - put_device(&sysmmu->dev); + put_device(&sysmmu->dev); + if (!data) return -ENODEV; - } if (!owner) { owner = kzalloc(sizeof(*owner), GFP_KERNEL); - if (!owner) { - put_device(&sysmmu->dev); + if (!owner) return -ENOMEM; - } INIT_LIST_HEAD(&owner->controllers); mutex_init(&owner->rpm_lock); @@ -1476,6 +1473,7 @@ static int exynos_iommu_of_xlate(struct device *dev, static const struct iommu_ops exynos_iommu_ops = { .identity_domain = &exynos_identity_domain, + .release_domain = &exynos_identity_domain, .domain_alloc_paging = exynos_iommu_domain_alloc_paging, .device_group = generic_device_group, .probe_device = exynos_iommu_probe_device, diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 5f08523f97cb..9664ef9840d2 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val) } static int fsl_pamu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); unsigned long flags; @@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, * switches to what looks like BLOCKING. 
*/ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct fsl_dma_domain *dma_domain; const u32 *prop; int len; @@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, * Hack to keep things working as they always have, only leaving an * UNMANAGED domain makes it BLOCKING. */ - if (domain == platform_domain || !domain || - domain->type != IOMMU_DOMAIN_UNMANAGED) + if (old == platform_domain || !old || + old->type != IOMMU_DOMAIN_UNMANAGED) return 0; - dma_domain = to_fsl_dma_domain(domain); + dma_domain = to_fsl_dma_domain(old); /* * Use LIODN of the PCI controller while detaching a diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig new file mode 100644 index 000000000000..52ac9e661ffd --- /dev/null +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -0,0 +1,14 @@ +CONFIG_KUNIT=y +CONFIG_GENERIC_PT=y +CONFIG_DEBUG_GENERIC_PT=y +CONFIG_IOMMU_PT=y +CONFIG_IOMMU_PT_AMDV1=y +CONFIG_IOMMU_PT_VTDSS=y +CONFIG_IOMMU_PT_X86_64=y +CONFIG_IOMMU_PT_KUNIT_TEST=y + +CONFIG_IOMMUFD=y +CONFIG_DEBUG_KERNEL=y +CONFIG_FAULT_INJECTION=y +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_IOMMUFD_TEST=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig new file mode 100644 index 000000000000..ce4fb4786914 --- /dev/null +++ b/drivers/iommu/generic_pt/Kconfig @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig GENERIC_PT + bool "Generic Radix Page Table" if COMPILE_TEST + help + Generic library for building radix tree page tables. + + Generic PT provides a set of HW page table formats and a common + set of APIs to work with them. + +if GENERIC_PT +config DEBUG_GENERIC_PT + bool "Extra debugging checks for GENERIC_PT" + help + Enable extra run time debugging checks for GENERIC_PT code. This + incurs a runtime cost and should not be enabled for production + kernels. + + The kunit tests require this to be enabled to get full coverage. + +config IOMMU_PT + tristate "IOMMU Page Tables" + select IOMMU_API + depends on IOMMU_SUPPORT + depends on GENERIC_PT + help + Generic library for building IOMMU page tables + + IOMMU_PT provides an implementation of the page table operations + related to struct iommu_domain using GENERIC_PT. It provides a single + implementation of the page table operations that can be shared by + multiple drivers. + +if IOMMU_PT +config IOMMU_PT_AMDV1 + tristate "IOMMU page table for 64-bit AMD IOMMU v1" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the AMD v1 page table. AMDv1 is the + "host" page table. It supports granular page sizes of almost every + power of 2 and decodes the full 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_VTDSS + tristate "IOMMU page table for Intel VT-d Second Stage" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5 + level Second Stage page table. It is similar to the X86_64 format with + 4K/2M/1G page sizes. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_X86_64 + tristate "IOMMU page table for x86 64-bit, 4/5 levels" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the x86 64-bit 4/5 level page table. 
+ It supports 4K/2M/1G page sizes and can decode a sign-extended + portion of the 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_KUNIT_TEST + tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 + depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 + depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS + default KUNIT_ALL_TESTS + help + Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the + enabled page table formats. The test covers most of the GENERIC_PT + functions provided by the page table format, as well as covering the + iommu_domain related functions. + +endif +endif diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile new file mode 100644 index 000000000000..976b49ec97dc --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0 + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 +iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 + +IOMMU_PT_KUNIT_TEST := +define create_format +obj-$(2) += iommu_$(1).o +iommu_pt_kunit_test-y += kunit_iommu_$(1).o +CFLAGS_kunit_iommu_$(1).o += -DGENERIC_PT_KUNIT=1 +IOMMU_PT_KUNIT_TEST := iommu_pt_kunit_test.o + +endef + +$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y))) +$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m))) + +# The kunit objects are constructed by compiling the main source +# with -DGENERIC_PT_KUNIT +$(obj)/kunit_iommu_%.o: $(src)/iommu_%.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) + +obj-$(CONFIG_IOMMU_PT_KUNIT_TEST) += $(IOMMU_PT_KUNIT_TEST) diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h new file mode 100644 index 000000000000..aa8e1a8ec95f --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -0,0 +1,411 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * AMD IOMMU v1 page table + * + * This is described in Section "2.2.3 I/O Page Tables for Host Translations" + * of the "AMD I/O Virtualization Technology (IOMMU) Specification" + * + * Note the level numbering here matches the core code, so level 0 is the same + * as mode 1. + * + */ +#ifndef __GENERIC_PT_FMT_AMDV1_H +#define __GENERIC_PT_FMT_AMDV1_H + +#include "defs_amdv1.h" +#include "../pt_defs.h" + +#include <asm/page.h> +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/mem_encrypt.h> +#include <linux/minmax.h> +#include <linux/sizes.h> +#include <linux/string.h> + +enum { + PT_ITEM_WORD_SIZE = sizeof(u64), + /* + * The IOMMUFD selftest uses the AMDv1 format with some alterations It + * uses a 2k page size to test cases where the CPU page size is not the + * same. 
+ */ +#ifdef AMDV1_IOMMUFD_SELFTEST + PT_MAX_VA_ADDRESS_LG2 = 56, + PT_MAX_OUTPUT_ADDRESS_LG2 = 51, + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 11, +#else + PT_MAX_VA_ADDRESS_LG2 = 64, + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_TOP_LEVEL = 5, + PT_GRANULE_LG2SZ = 12, +#endif + PT_TABLEMEM_LG2SZ = 12, + + /* The DTE only has these bits for the top phyiscal address */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* PTE bits */ +enum { + AMDV1PT_FMT_PR = BIT(0), + AMDV1PT_FMT_D = BIT(6), + AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9), + AMDV1PT_FMT_OA = GENMASK_ULL(51, 12), + AMDV1PT_FMT_FC = BIT_ULL(60), + AMDV1PT_FMT_IR = BIT_ULL(61), + AMDV1PT_FMT_IW = BIT_ULL(62), +}; + +/* + * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make + * these defines to avoid it. + */ +#define AMDV1PT_FMT_NL_DEFAULT 0 +#define AMDV1PT_FMT_NL_SIZE 7 + +static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ); +} +#define pt_table_pa amdv1pt_table_pa + +/* Returns the oa for the start of the contiguous entry */ +static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + pt_oaddr_t oa; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + oa = FIELD_GET(AMDV1PT_FMT_OA, entry); + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) { + unsigned int sz_bits = oaffz(oa); + + oa = oalog2_set_mod(oa, 0, sz_bits); + } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) != + AMDV1PT_FMT_NL_DEFAULT)) + return 0; + return oalog2_mul(oa, PT_GRANULE_LG2SZ); +} +#define pt_entry_oa amdv1pt_entry_oa + +static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts) +{ + /* + * Table 15: Page Table Level Parameters + * The top most level cannot have translation entries + */ + return pts->level < PT_MAX_TOP_LEVEL; +} +#define pt_can_have_leaf amdv1pt_can_have_leaf + +/* Body in pt_fmt_defaults.h */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +static inline unsigned int +amdv1pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + u32 code; + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) == + AMDV1PT_FMT_NL_DEFAULT) + return ilog2(1); + + PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) != + AMDV1PT_FMT_NL_SIZE); + + /* + * The contiguous size is encoded in the length of a string of 1's in + * the low bits of the OA. Reverse the equation: + * code = log2_to_int(num_contig_lg2 + item_lg2sz - + * PT_GRANULE_LG2SZ - 1) - 1 + * Which can be expressed as: + * num_contig_lg2 = oalog2_ffz(code) + 1 - + * item_lg2sz - PT_GRANULE_LG2SZ + * + * Assume the bit layout is correct and remove the masking. Reorganize + * the equation to move all the arithmetic before the ffz. + */ + code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 + + pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ); + return ffz_t(u32, code); +} +#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2 + +static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts) +{ + /* + * Top entry covers bits [63:57] only, this is handled through + * max_vasz_lg2. 
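The encoding handled by amdv1pt_entry_num_contig_lg2() above stores a contiguous mapping's size as a run of 1 bits in the low part of the OA field when Next Level reads 7. Below is a stand-alone sketch of that arithmetic for a 4 KiB granule; the helper names are invented for illustration, and the kernel does this with FIELD_PREP()/ffz rather than loops.

#include <assert.h>
#include <stdint.h>

/* Encode: a 2^oasz_lg2 byte mapping sets OA bits 12..(oasz_lg2 - 2) to 1 */
static uint64_t amdv1_size_bits(unsigned int oasz_lg2)
{
	return ((1ull << (oasz_lg2 - 12 - 1)) - 1) << 12;
}

/* Decode: the first clear bit at or above bit 12 marks the mapping size */
static unsigned int amdv1_decode_lg2(uint64_t pte)
{
	unsigned int n = 12;

	while (pte & (1ull << n))
		n++;
	return n + 1;
}

int main(void)
{
	assert(amdv1_size_bits(21) == 0xff000);		/* 2 MiB: bits 20:12 = 0_1111_1111 */
	assert(amdv1_decode_lg2(0xff000) == 21);
	assert(amdv1_size_bits(30) == 0x1ffff000);	/* 1 GiB: bits 29:12 */
	assert(amdv1_decode_lg2(0x1ffff000) == 30);
	return 0;
}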
+ */ + if (PT_WARN_ON(pts->level == 5)) + return 7; + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 amdv1pt_num_items_lg2 + +static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!amdv1pt_can_have_leaf(pts)) + return 0; + + /* + * Table 14: Example Page Size Encodings + * Address bits 51:32 can be used to encode page sizes greater than 4 + * Gbytes. Address bits 63:52 are zero-extended. + * + * 512GB Pages are not supported due to a hardware bug. + * Otherwise every power of two size is supported. + */ + return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1), + isz_lg2) & ~SZ_512G; +} +#define pt_possible_sizes amdv1pt_possible_sizes + +static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64) + pts->index; + unsigned int next_level; + u64 entry; + + pts->entry = entry = READ_ONCE(*tablep); + if (!(entry & AMDV1PT_FMT_PR)) + return PT_ENTRY_EMPTY; + + next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry); + if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT || + next_level == AMDV1PT_FMT_NL_SIZE) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw amdv1pt_load_entry_raw + +static inline void +amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + + if (oasz_lg2 == isz_lg2) { + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_DEFAULT); + WRITE_ONCE(*tablep, entry); + } else { + unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_SIZE) | + FIELD_PREP(AMDV1PT_FMT_OA, + oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ - + 1) - + 1); + + /* See amdv1pt_clear_entries() */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, entry); + } else { + memset64(tablep, entry, log2_to_int(num_contig_lg2)); + } + } + pts->entry = entry; +} +#define pt_install_leaf_entry amdv1pt_install_leaf_entry + +static inline bool amdv1pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + /* + * IR and IW are ANDed from the table levels along with the PTE. We + * always control permissions from the PTE, so always set IR and IW for + * tables. 
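A worked reading of amdv1pt_possible_sizes() above, assuming the non-selftest 4 KiB granule: each level can map its own item size and power-of-two runs of items within a table, capped at 2^51, with 512 GiB knocked out by the quirk mask. This stand-alone sketch recomputes the bitmaps (the GENMASK_ULL macro is redefined locally only so the example compiles outside the kernel):

#include <assert.h>
#include <stdint.h>

#define GENMASK_ULL(h, l) ((~0ull >> (63 - (h))) & (~0ull << (l)))
#define SZ_512G (1ull << 39)

static uint64_t amdv1_possible_sizes(unsigned int level)	/* levels 0..4 */
{
	unsigned int isz_lg2 = 12 + level * 9;	/* 4K, 2M, 1G, 512G, 256T items */
	unsigned int top = isz_lg2 + 9 - 1;

	if (top > 51)
		top = 51;
	return GENMASK_ULL(top, isz_lg2) & ~SZ_512G;
}

int main(void)
{
	assert(amdv1_possible_sizes(0) == GENMASK_ULL(20, 12)); /* 4K..1M runs */
	assert(amdv1_possible_sizes(3) == GENMASK_ULL(47, 40)); /* 512G itself excluded */
	assert(amdv1_possible_sizes(4) == GENMASK_ULL(51, 48)); /* capped at 2^51 */
	return 0;
}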
+ */ + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) | + FIELD_PREP(AMDV1PT_FMT_OA, + log2_div(table_pa, PT_GRANULE_LG2SZ)) | + AMDV1PT_FMT_IR | AMDV1PT_FMT_IW; + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table amdv1pt_install_table + +static inline void amdv1pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = + pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW); +} +#define pt_attr_from_entry amdv1pt_attr_from_entry + +static inline void amdv1pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + /* + * gcc generates rep stos for the io-pgtable code, and this difference + * can show in microbenchmarks with larger contiguous page sizes. + * rep is slower for small cases. + */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); + } else { + memset64(tablep, 0, log2_to_int(num_contig_lg2)); + } +} +#define pt_clear_entries amdv1pt_clear_entries + +static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + if (READ_ONCE(*tablep) & AMDV1PT_FMT_D) + return true; + return false; +} +#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty + +static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D); +} +#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean + +static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | AMDV1PT_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_amdv1 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_amdv1, iommu) + ->amdpt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu; +} + +static inline int amdv1pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE)) + pte |= AMDV1PT_FMT_FC; + if (iommu_prot & IOMMU_READ) + pte |= AMDV1PT_FMT_IR; + if (iommu_prot & IOMMU_WRITE) + pte |= AMDV1PT_FMT_IW; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. 
+ */ + if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot amdv1pt_iommu_set_prot + +static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table, + const struct pt_iommu_amdv1_cfg *cfg) +{ + struct pt_amdv1 *table = &iommu_table->amdpt; + unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2; + + if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL) + return -EINVAL; + + if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) && + cfg->starting_level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) * + (cfg->starting_level + 1); + + table->common.max_vasz_lg2 = + min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2); + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + pt_top_set_level(&table->common, cfg->starting_level); + return 0; +} +#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init + +#ifndef PT_FMT_VARIANT +static inline void +amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, + const struct pt_range *top_range, + struct pt_iommu_amdv1_hw_info *info) +{ + info->host_pt_root = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK); + info->mode = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info +#endif + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = { + /* Matches what io_pgtable does */ + [0] = { .starting_level = 2 }, +}; +#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = 0 }; +#endif + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h new file mode 100644 index 000000000000..0b9614ca6d10 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H +#define __GENERIC_PT_FMT_DEFS_AMDV1_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct amdv1pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs amdv1pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h new file mode 100644 index 000000000000..4a239bcaae2a --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H +#define __GENERIC_PT_FMT_DEFS_VTDSS_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct vtdss_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs vtdss_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h new file mode 100644 index 000000000000..6f589e1f55d3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H +#define __GENERIC_PT_FMT_DEFS_X86_64_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 
pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct x86_64_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs x86_64_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c new file mode 100644 index 000000000000..72a2337d0c55 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT amdv1 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \ + BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) +#define PT_FORCE_ENABLED_FEATURES \ + (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c new file mode 100644 index 000000000000..74e597cba9d9 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define AMDV1_IOMMUFD_SELFTEST 1 +#define PT_FMT amdv1 +#define PT_FMT_VARIANT mock +#define PT_SUPPORTED_FEATURES 0 + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h new file mode 100644 index 000000000000..d28e86abdf2e --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Template to build the iommu module and kunit from the format and + * implementation headers. + * + * The format should have: + * #define PT_FMT <name> + * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy)) + * And optionally: + * #define PT_FORCE_ENABLED_FEATURES .. + * #define PT_FMT_VARIANT <suffix> + */ +#include <linux/args.h> +#include <linux/stringify.h> + +#ifdef PT_FMT_VARIANT +#define PTPFX_RAW \ + CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT) +#else +#define PTPFX_RAW PT_FMT +#endif + +#define PTPFX CONCATENATE(PTPFX_RAW, _) + +#define _PT_FMT_H PT_FMT.h +#define PT_FMT_H __stringify(_PT_FMT_H) + +#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H) +#define PT_DEFS_H __stringify(_PT_DEFS_H) + +#include <linux/generic_pt/common.h> +#include PT_DEFS_H +#include "../pt_defs.h" +#include PT_FMT_H +#include "../pt_common.h" + +#ifndef GENERIC_PT_KUNIT +#include "../iommu_pt.h" +#else +/* + * The makefile will compile the .c file twice, once with GENERIC_PT_KUNIT set + * which means we are building the kunit module. 
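iommu_template.h above generates one set of symbols per format purely by token pasting, so the same C file compiled with PT_FMT set to amdv1, vtdss or x86_64 yields pt_iommu_amdv1_*, pt_iommu_vtdss_*, and so on. A stand-alone sketch of the same trick with a made-up demo format follows (the kernel uses CONCATENATE() and a PTPFX that already carries the trailing underscore):

#include <stdio.h>

#define PT_FMT demo			/* stands in for amdv1, vtdss, x86_64 */
#define CAT_(a, b) a##b
#define CAT(a, b) CAT_(a, b)
#define DOMAIN_NS(op) CAT(CAT(pt_iommu_, PT_FMT), CAT(_, op))

/* Expands to: static int pt_iommu_demo_iova_to_phys(void) */
static int DOMAIN_NS(iova_to_phys)(void)
{
	return 42;
}

int main(void)
{
	printf("%d\n", pt_iommu_demo_iova_to_phys());
	return 0;
}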
+ */ +#include "../kunit_generic_pt.h" +#include "../kunit_iommu_pt.h" +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c new file mode 100644 index 000000000000..f551711e2a33 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT vtdss +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \ + BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c new file mode 100644 index 000000000000..5472660c2d71 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT x86_64 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ + BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h new file mode 100644 index 000000000000..f5f8981edde7 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/vtdss.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + * Intel VT-d Second Stange 5/4 level page table + * + * This is described in + * Section "3.7 Second-Stage Translation" + * Section "9.8 Second-Stage Paging Entries" + * + * Of the "Intel Virtualization Technology for Directed I/O Architecture + * Specification". 
+ * + * The named levels in the spec map to the pts->level as: + * Table/SS-PTE - 0 + * Directory/SS-PDE - 1 + * Directory Ptr/SS-PDPTE - 2 + * PML4/SS-PML4E - 3 + * PML5/SS-PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_VTDSS_H +#define __GENERIC_PT_FMT_VTDSS_H + +#include "defs_vtdss.h" +#include "../pt_defs.h" + +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/log2.h> + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* SSPTPTR is 4k aligned and limited by HAW */ + PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12), +}; + +/* Shared descriptor bits */ +enum { + VTDSS_FMT_R = BIT(0), + VTDSS_FMT_W = BIT(1), + VTDSS_FMT_A = BIT(8), + VTDSS_FMT_D = BIT(9), + VTDSS_FMT_SNP = BIT(11), + VTDSS_FMT_OA = GENMASK_ULL(51, 12), +}; + +/* PDPTE/PDE */ +enum { + VTDSS_FMT_PS = BIT(7), +}; + +#define common_to_vtdss_pt(common_ptr) \ + container_of_const(common_ptr, struct pt_vtdss, common) +#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common) + +static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa vtdss_pt_table_pa + +static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa vtdss_pt_entry_oa + +static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf vtdss_pt_can_have_leaf + +static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 vtdss_pt_num_items_lg2 + +static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!entry) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw vtdss_pt_load_entry_raw + +static inline void +vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= VTDSS_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry vtdss_pt_install_leaf_entry + +static inline bool vtdss_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = VTDSS_FMT_R | VTDSS_FMT_W | + FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + return pt_table_install64(pts, entry); +} +#define pt_install_table vtdss_pt_install_table + +static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP); +} +#define pt_attr_from_entry vtdss_pt_attr_from_entry + +static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + 
return READ_ONCE(*tablep) & VTDSS_FMT_D; +} +#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty + +static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D); +} +#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean + +static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | VTDSS_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty + +static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common) +{ + return 10; +} +#define pt_max_sw_bit vtdss_pt_max_sw_bit + +static inline u64 vtdss_pt_sw_bit(unsigned int bitnr) +{ + if (__builtin_constant_p(bitnr) && bitnr > 10) + BUILD_BUG(); + + /* Bits marked Ignored in the specification */ + switch (bitnr) { + case 0: + return BIT(10); + case 1 ... 9: + return BIT_ULL((bitnr - 1) + 52); + case 10: + return BIT_ULL(63); + /* Some bits in 9-3 are available in some entries */ + default: + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit vtdss_pt_sw_bit + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_vtdss + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->vtdss_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, vtdss_pt.common) + ->iommu; +} + +static inline int vtdss_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + /* + * VTDSS does not have a present bit, so we tell if any entry is present + * by checking for R or W. 
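As the comment above says, a second-stage entry carries no dedicated present bit, so presence is inferred from the R/W permission bits; that is also why the surrounding set_prot helper rejects a mapping that grants neither read nor write. A trivial stand-alone restatement, using locally defined bit names:

#include <stdbool.h>
#include <stdint.h>

#define VTDSS_R (1ull << 0)
#define VTDSS_W (1ull << 1)

/* An all-zero entry is empty; anything mapped must grant read and/or write */
static bool vtdss_entry_present(uint64_t pte)
{
	return pte & (VTDSS_R | VTDSS_W);
}

int main(void)
{
	return vtdss_entry_present(VTDSS_R) ? 0 : 1;
}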
+ */ + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) + return -EINVAL; + + if (iommu_prot & IOMMU_READ) + pte |= VTDSS_FMT_R; + if (iommu_prot & IOMMU_WRITE) + pte |= VTDSS_FMT_W; + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE)) + pte |= VTDSS_FMT_SNP; + + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) && + !(iommu_prot & IOMMU_WRITE)) { + pr_err_ratelimited( + "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot vtdss_pt_iommu_set_prot + +static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table, + const struct pt_iommu_vtdss_cfg *cfg) +{ + struct pt_vtdss *table = &iommu_table->vtdss_pt; + + if (cfg->top_level > 4 || cfg->top_level < 2) + return -EOPNOTSUPP; + + pt_top_set_level(&table->common, cfg->top_level); + return 0; +} +#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init + +static inline void +vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table, + const struct pt_range *top_range, + struct pt_iommu_vtdss_hw_info *info) +{ + info->ssptptr = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK); + /* + * top_level = 2 = 3 level table aw=1 + * top_level = 3 = 4 level table aw=2 + * top_level = 4 = 5 level table aw=3 + */ + info->aw = top_range->top_level - 1; +} +#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = { + [0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2}, + [1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3}, + [2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4}, +}; +#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) }; +#endif +#endif diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h new file mode 100644 index 000000000000..210748d9d6e8 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/x86_64.h @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * x86 page table. Supports the 4 and 5 level variations. + * + * The 4 and 5 level version is described in: + * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software + * Developer's Manual Volume 3 + * + * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization + * Technology for Directed I/O Architecture Specification" + * + * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O + * Virtualization Technology (IOMMU) Specification" + * + * It is used by x86 CPUs, AMD and VT-d IOMMU HW. + * + * Note the 3 level format is very similar and almost implemented here. The + * reserved/ignored layout is different and there are functional bit + * differences. + * + * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower + * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign + * extended addressing with this page table format. 
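PT_FEAT_SIGN_EXTEND, mentioned just above, gives the 4-level variant the usual upper/lower canonical split: an IOVA is valid only when bits 63:47 replicate bit 47 (bits 63:56 replicating bit 56 in the 5-level case). A stand-alone check for the 48-bit layout, written only for illustration:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool canonical_48(uint64_t iova)
{
	/* shift the 48 significant bits up, arithmetic-shift back, compare */
	int64_t sext = (int64_t)(iova << 16) >> 16;

	return (uint64_t)sext == iova;
}

int main(void)
{
	assert(canonical_48(0x00007fffffffffffull));	/* top of the lower half */
	assert(canonical_48(0xffff800000000000ull));	/* bottom of the upper half */
	assert(!canonical_48(0x0000800000000000ull));	/* hole in the middle */
	return 0;
}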
+ * + * The named levels in the spec map to the pts->level as: + * Table/PTE - 0 + * Directory/PDE - 1 + * Directory Ptr/PDPTE - 2 + * PML4/PML4E - 3 + * PML5/PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_X86_64_H +#define __GENERIC_PT_FMT_X86_64_H + +#include "defs_x86_64.h" +#include "../pt_defs.h" + +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/log2.h> +#include <linux/mem_encrypt.h> + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* + * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k + * aligned and is limited by the architected HAW + */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* Shared descriptor bits */ +enum { + X86_64_FMT_P = BIT(0), + X86_64_FMT_RW = BIT(1), + X86_64_FMT_U = BIT(2), + X86_64_FMT_A = BIT(5), + X86_64_FMT_D = BIT(6), + X86_64_FMT_OA = GENMASK_ULL(51, 12), + X86_64_FMT_XD = BIT_ULL(63), +}; + +/* PDPTE/PDE */ +enum { + X86_64_FMT_PS = BIT(7), +}; + +static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa x86_64_pt_table_pa + +static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa x86_64_pt_entry_oa + +static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf x86_64_pt_can_have_leaf + +static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 x86_64_pt_num_items_lg2 + +static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!(entry & X86_64_FMT_P)) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw x86_64_pt_load_entry_raw + +static inline void +x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = X86_64_FMT_P | + FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= X86_64_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry x86_64_pt_install_leaf_entry + +static inline bool x86_64_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table x86_64_pt_install_table + +static inline void 
x86_64_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + X86_64_FMT_D | X86_64_FMT_XD); +} +#define pt_attr_from_entry x86_64_pt_attr_from_entry + +static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common) +{ + return 12; +} +#define pt_max_sw_bit x86_64_pt_max_sw_bit + +static inline u64 x86_64_pt_sw_bit(unsigned int bitnr) +{ + if (__builtin_constant_p(bitnr) && bitnr > 12) + BUILD_BUG(); + + /* Bits marked Ignored/AVL in the specification */ + switch (bitnr) { + case 0: + return BIT(9); + case 1: + return BIT(11); + case 2 ... 12: + return BIT_ULL((bitnr - 2) + 52); + /* Some bits in 8,6,4,3 are available in some entries */ + default: + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit x86_64_pt_sw_bit + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_x86_64 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->x86_64_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, x86_64_pt.common) + ->iommu; +} + +static inline int x86_64_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte; + + pte = X86_64_FMT_U | X86_64_FMT_A; + if (iommu_prot & IOMMU_WRITE) + pte |= X86_64_FMT_RW | X86_64_FMT_D; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. 
+ */ + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot x86_64_pt_iommu_set_prot + +static inline int +x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table, + const struct pt_iommu_x86_64_cfg *cfg) +{ + struct pt_x86_64 *table = &iommu_table->x86_64_pt; + + if (cfg->top_level < 3 || cfg->top_level > 4) + return -EOPNOTSUPP; + + pt_top_set_level(&table->common, cfg->top_level); + + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + return 0; +} +#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init + +static inline void +x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table, + const struct pt_range *top_range, + struct pt_iommu_x86_64_hw_info *info) +{ + info->gcr3_pt = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK); + info->levels = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = { + [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 48, .top_level = 3 }, + [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 57, .top_level = 4 }, + /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */ + [2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 }, + [3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4}, +}; +#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)}; +#endif +#endif diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h new file mode 100644 index 000000000000..97aeda1ad01c --- /dev/null +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -0,0 +1,1289 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * "Templated C code" for implementing the iommu operations for page tables. + * This is compiled multiple times, over all the page table formats to pick up + * the per-format definitions. 
+ */ +#ifndef __GENERIC_PT_IOMMU_PT_H +#define __GENERIC_PT_IOMMU_PT_H + +#include "pt_iter.h" + +#include <linux/export.h> +#include <linux/iommu.h> +#include "../iommu-pages.h" +#include <linux/cleanup.h> +#include <linux/dma-mapping.h> + +enum { + SW_BIT_CACHE_FLUSH_DONE = 0, +}; + +static void flush_writes_range(const struct pt_state *pts, + unsigned int start_index, unsigned int end_index) +{ + if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_flush_incoherent( + iommu_from_common(pts->range->common)->iommu_device, + pts->table, start_index * PT_ITEM_WORD_SIZE, + (end_index - start_index) * PT_ITEM_WORD_SIZE); +} + +static void flush_writes_item(const struct pt_state *pts) +{ + if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_flush_incoherent( + iommu_from_common(pts->range->common)->iommu_device, + pts->table, pts->index * PT_ITEM_WORD_SIZE, + PT_ITEM_WORD_SIZE); +} + +static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather, + struct pt_iommu *iommu_table, pt_vaddr_t iova, + pt_vaddr_t len, + struct iommu_pages_list *free_list) +{ + struct pt_common *common = common_from_iommu(iommu_table); + + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(free_list, + iommu_table->iommu_device); + + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && + iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { + iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); + /* + * Note that the sync frees the gather's free list, so we must + * not have any pages on that list that are covered by iova/len + */ + } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) { + iommu_iotlb_gather_add_range(iotlb_gather, iova, len); + } + + iommu_pages_list_splice(free_list, &iotlb_gather->freelist); +} + +#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op) + +static int make_range_ul(struct pt_common *common, struct pt_range *range, + unsigned long iova, unsigned long len) +{ + unsigned long last; + + if (unlikely(len == 0)) + return -EINVAL; + + if (check_add_overflow(iova, len - 1, &last)) + return -EOVERFLOW; + + *range = pt_make_range(common, iova, last); + if (sizeof(iova) > sizeof(range->va)) { + if (unlikely(range->va != iova || range->last_va != last)) + return -EOVERFLOW; + } + return 0; +} + +static __maybe_unused int make_range_u64(struct pt_common *common, + struct pt_range *range, u64 iova, + u64 len) +{ + if (unlikely(iova > ULONG_MAX || len > ULONG_MAX)) + return -EOVERFLOW; + return make_range_ul(common, range, iova, len); +} + +/* + * Some APIs use unsigned long, while othersuse dma_addr_t as the type. Dispatch + * to the correct validation based on the type. + */ +#define make_range_no_check(common, range, iova, len) \ + ({ \ + int ret; \ + if (sizeof(iova) > sizeof(unsigned long) || \ + sizeof(len) > sizeof(unsigned long)) \ + ret = make_range_u64(common, range, iova, len); \ + else \ + ret = make_range_ul(common, range, iova, len); \ + ret; \ + }) + +#define make_range(common, range, iova, len) \ + ({ \ + int ret = make_range_no_check(common, range, iova, len); \ + if (!ret) \ + ret = pt_check_range(range); \ + ret; \ + }) + +static inline unsigned int compute_best_pgsize(struct pt_state *pts, + pt_oaddr_t oa) +{ + struct pt_iommu *iommu_table = iommu_from_common(pts->range->common); + + if (!pt_can_have_leaf(pts)) + return 0; + + /* + * The page size is limited by the domain's bitmap. This allows the core + * code to reduce the supported page sizes by changing the bitmap. 
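The page-size selection described in the comment above is an intersection: the sizes the format can express at this level, masked by the domain's pgsize_bitmap, and then the largest remaining size to which both the IOVA and the output address are aligned and which still fits the range being mapped. A rough stand-alone model of that selection (it is not the kernel's pt_compute_best_pgsize(), just the idea):

#include <assert.h>
#include <stdint.h>

static unsigned int best_pgsize_lg2(uint64_t pgsize_bitmap, uint64_t va,
				    uint64_t last_va, uint64_t oa)
{
	uint64_t len = last_va - va + 1;

	for (unsigned int lg2 = 63; lg2 >= 12; lg2--) {
		uint64_t sz = 1ull << lg2;

		if ((pgsize_bitmap & sz) && !(va & (sz - 1)) &&
		    !(oa & (sz - 1)) && len >= sz)
			return lg2;
	}
	return 0;	/* nothing usable */
}

int main(void)
{
	uint64_t bitmap = (1ull << 12) | (1ull << 21);	/* 4K | 2M */

	/* 2M-aligned IOVA and OA with 4M left to map: use a 2M entry */
	assert(best_pgsize_lg2(bitmap, 0x200000, 0x5fffff, 0x40200000) == 21);
	/* misaligned by 4K: fall back to 4K entries */
	assert(best_pgsize_lg2(bitmap, 0x201000, 0x5fffff, 0x40201000) == 12);
	return 0;
}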
+ */ + return pt_compute_best_pgsize(pt_possible_sizes(pts) & + iommu_table->domain.pgsize_bitmap, + pts->range->va, pts->range->last_va, oa); +} + +static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + pt_oaddr_t *res = arg; + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, descend_fn); + case PT_ENTRY_OA: + *res = pt_entry_oa_exact(&pts); + return 0; + } + return -ENOENT; +} +PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys); + +/** + * iova_to_phys() - Return the output address for the given IOVA + * @domain: Table to query + * @iova: IO virtual address to query + * + * Determine the output address from the given IOVA. @iova may have any + * alignment, the returned physical will be adjusted with any sub page offset. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Return: 0 if there is no translation for the given iova. + */ +phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_range range; + pt_oaddr_t res; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + ret = pt_walk_range(&range, __iova_to_phys, &res); + /* PHYS_ADDR_MAX would be a better error code */ + if (ret) + return 0; + return res; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); + +struct pt_iommu_dirty_args { + struct iommu_dirty_bitmap *dirty; + unsigned int flags; +}; + +static void record_dirty(struct pt_state *pts, + struct pt_iommu_dirty_args *dirty, + unsigned int num_contig_lg2) +{ + pt_vaddr_t dirty_len; + + if (num_contig_lg2 != ilog2(1)) { + unsigned int index = pts->index; + unsigned int end_index = log2_set_mod_max_t( + unsigned int, pts->index, num_contig_lg2); + + /* Adjust for being contained inside a contiguous page */ + end_index = min(end_index, pts->end_index); + dirty_len = (end_index - index) * + log2_to_int(pt_table_item_lg2sz(pts)); + } else { + dirty_len = log2_to_int(pt_table_item_lg2sz(pts)); + } + + if (dirty->dirty->bitmap) + iova_bitmap_set(dirty->dirty->bitmap, pts->range->va, + dirty_len); + + if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) { + /* + * No write log required because DMA incoherence and atomic + * dirty tracking bits can't work together + */ + pt_entry_make_write_clean(pts); + iommu_iotlb_gather_add_range(dirty->dirty->gather, + pts->range->va, dirty_len); + } +} + +static inline int __read_and_clear_dirty(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_dirty_args *dirty = arg; + int ret; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + ret = pt_descend(&pts, arg, __read_and_clear_dirty); + if (ret) + return ret; + continue; + } + if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts)) + record_dirty(&pts, dirty, + pt_entry_num_contig_lg2(&pts)); + } + return 0; +} + +/** + * read_and_clear_dirty() - Manipulate the HW set write dirty state + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the IOVA + * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR + * @dirty: Place to store the dirty bits + * + * Iterate over all the 
entries in the mapped range and record their write dirty + * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is not specified then + * the entries will be left dirty, otherwise they are returned to being not + * write dirty. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Returns: -ERRNO on failure, 0 on success. + */ +int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_iommu_dirty_args dirty_args = { + .dirty = dirty, + .flags = flags, + }; + struct pt_range range; + int ret; + +#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty) + return -EOPNOTSUPP; +#endif + + ret = make_range(common_from_iommu(iommu_table), &range, iova, size); + if (ret) + return ret; + + ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args); + PT_WARN_ON(ret); + return ret; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU"); + +static inline int __set_dirty(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, __set_dirty); + case PT_ENTRY_OA: + if (!pt_entry_make_write_dirty(&pts)) + return -EAGAIN; + return 0; + } + return -ENOENT; +} + +static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table, + dma_addr_t iova) +{ + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + /* + * Note: There is no locking here yet, if the test suite races this it + * can crash. It should use RCU locking eventually. + */ + return pt_walk_range(&range, __set_dirty, NULL); +} + +struct pt_iommu_collect_args { + struct iommu_pages_list free_list; + /* Fail if any OAs are within the range */ + u8 check_mapped : 1; +}; + +static int __collect_tables(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_collect_args *collect = arg; + int ret; + + if (!collect->check_mapped && !pt_can_have_table(&pts)) + return 0; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + iommu_pages_list_add(&collect->free_list, pts.table_lower); + ret = pt_descend(&pts, arg, __collect_tables); + if (ret) + return ret; + continue; + } + if (pts.type == PT_ENTRY_OA && collect->check_mapped) + return -EADDRINUSE; + } + return 0; +} + +enum alloc_mode {ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH}; + +/* Allocate a table, the empty table will be ready to be installed. 
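The read-and-clear walk documented above reports which mapped pages the hardware marked as written and, unless IOMMU_DIRTY_NO_CLEAR is set, rearms the dirty bit so the next pass only sees new writes. A stand-alone model over a flat PTE array (bit position per the VTDSS enum earlier; everything else is invented for the example):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define FMT_D (1ull << 9)	/* write-dirty, as in VTDSS_FMT_D */

static uint64_t harvest_dirty(uint64_t *ptes, unsigned int n, bool no_clear)
{
	uint64_t bitmap = 0;

	for (unsigned int i = 0; i < n; i++) {
		if (!(ptes[i] & FMT_D))
			continue;
		bitmap |= 1ull << i;		/* report page i as dirty */
		if (!no_clear)
			ptes[i] &= ~FMT_D;	/* rearm for the next pass */
	}
	return bitmap;
}

int main(void)
{
	uint64_t ptes[4] = { 0, FMT_D, 0, FMT_D };

	assert(harvest_dirty(ptes, 4, false) == 0xa);	/* pages 1 and 3 */
	assert(harvest_dirty(ptes, 4, false) == 0);	/* already cleared */
	return 0;
}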
*/ +static inline struct pt_table_p *_table_alloc(struct pt_common *common, + size_t lg2sz, gfp_t gfp, + enum alloc_mode mode) +{ + struct pt_iommu *iommu_table = iommu_from_common(common); + struct pt_table_p *table_mem; + + table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp, + log2_to_int(lg2sz)); + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + mode == ALLOC_NORMAL) { + int ret = iommu_pages_start_incoherent( + table_mem, iommu_table->iommu_device); + if (ret) { + iommu_free_pages(table_mem); + return ERR_PTR(ret); + } + } + return table_mem; +} + +static inline struct pt_table_p *table_alloc_top(struct pt_common *common, + uintptr_t top_of_table, + gfp_t gfp, + enum alloc_mode mode) +{ + /* + * Top doesn't need the free list or otherwise, so it technically + * doesn't need to use iommu pages. Use the API anyhow as the top is + * usually not smaller than PAGE_SIZE to keep things simple. + */ + return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table), + gfp, mode); +} + +/* Allocate an interior table */ +static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts, + gfp_t gfp, enum alloc_mode mode) +{ + struct pt_state child_pts = + pt_init(parent_pts->range, parent_pts->level - 1, NULL); + + return _table_alloc(parent_pts->range->common, + pt_num_items_lg2(&child_pts) + + ilog2(PT_ITEM_WORD_SIZE), + gfp, mode); +} + +static inline int pt_iommu_new_table(struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + struct pt_table_p *table_mem; + phys_addr_t phys; + + /* Given PA/VA/length can't be represented */ + if (PT_WARN_ON(!pt_can_have_table(pts))) + return -ENXIO; + + table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + + phys = virt_to_phys(table_mem); + if (!pt_install_table(pts, phys, attrs)) { + iommu_pages_free_incoherent( + table_mem, + iommu_from_common(pts->range->common)->iommu_device); + return -EAGAIN; + } + + if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) { + flush_writes_item(pts); + pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE); + } + + if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + /* + * The underlying table can't store the physical table address. + * This happens when kunit testing tables outside their normal + * environment where a CPU might be limited. + */ + pt_load_single_entry(pts); + if (PT_WARN_ON(pt_table_pa(pts) != phys)) { + pt_clear_entries(pts, ilog2(1)); + iommu_pages_free_incoherent( + table_mem, iommu_from_common(pts->range->common) + ->iommu_device); + return -EINVAL; + } + } + + pts->table_lower = table_mem; + return 0; +} + +struct pt_iommu_map_args { + struct iommu_iotlb_gather *iotlb_gather; + struct pt_write_attrs attrs; + pt_oaddr_t oa; + unsigned int leaf_pgsize_lg2; + unsigned int leaf_level; +}; + +/* + * This will recursively check any tables in the block to validate they are + * empty and then free them through the gather. 
+ */ +static int clear_contig(const struct pt_state *start_pts, + struct iommu_iotlb_gather *iotlb_gather, + unsigned int step, unsigned int pgsize_lg2) +{ + struct pt_iommu *iommu_table = + iommu_from_common(start_pts->range->common); + struct pt_range range = *start_pts->range; + struct pt_state pts = + pt_init(&range, start_pts->level, start_pts->table); + struct pt_iommu_collect_args collect = { .check_mapped = true }; + int ret; + + pts.index = start_pts->index; + pts.end_index = start_pts->index + step; + for (; _pt_iter_load(&pts); pt_next_entry(&pts)) { + if (pts.type == PT_ENTRY_TABLE) { + collect.free_list = + IOMMU_PAGES_LIST_INIT(collect.free_list); + ret = pt_walk_descend_all(&pts, __collect_tables, + &collect); + if (ret) + return ret; + + /* + * The table item must be cleared before we can update + * the gather + */ + pt_clear_entries(&pts, ilog2(1)); + flush_writes_item(&pts); + + iommu_pages_list_add(&collect.free_list, + pt_table_ptr(&pts)); + gather_range_pages( + iotlb_gather, iommu_table, range.va, + log2_to_int(pt_table_item_lg2sz(&pts)), + &collect.free_list); + } else if (pts.type != PT_ENTRY_EMPTY) { + return -EADDRINUSE; + } + } + return 0; +} + +static int __map_range_leaf(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; + unsigned int start_index; + pt_oaddr_t oa = map->oa; + unsigned int step; + bool need_contig; + int ret = 0; + + PT_WARN_ON(map->leaf_level != level); + PT_WARN_ON(!pt_can_have_leaf(&pts)); + + step = log2_to_int_t(unsigned int, + leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts)); + need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts); + + _pt_iter_first(&pts); + start_index = pts.index; + do { + pts.type = pt_load_entry_raw(&pts); + if (pts.type != PT_ENTRY_EMPTY || need_contig) { + if (pts.index != start_index) + pt_index_to_va(&pts); + ret = clear_contig(&pts, map->iotlb_gather, step, + leaf_pgsize_lg2); + if (ret) + break; + } + + if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + pt_index_to_va(&pts); + PT_WARN_ON(compute_best_pgsize(&pts, oa) != + leaf_pgsize_lg2); + } + pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs); + + oa += log2_to_int(leaf_pgsize_lg2); + pts.index += step; + } while (pts.index < pts.end_index); + + flush_writes_range(&pts, start_index, pts.index); + + map->oa = oa; + return ret; +} + +static int __map_range(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + int ret; + + PT_WARN_ON(map->leaf_level == level); + PT_WARN_ON(!pt_can_have_table(&pts)); + + _pt_iter_first(&pts); + + /* Descend to a child table */ + do { + pts.type = pt_load_entry_raw(&pts); + + if (pts.type != PT_ENTRY_TABLE) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + ret = pt_iommu_new_table(&pts, &map->attrs); + if (ret) { + /* + * Racing with another thread installing a table + */ + if (ret == -EAGAIN) + continue; + return ret; + } + } else { + pts.table_lower = pt_table_ptr(&pts); + /* + * Racing with a shared pt_iommu_new_table()? The other + * thread is still flushing the cache, so we have to + * also flush it to ensure that when our thread's map + * completes all the table items leading to our mapping + * are visible. 
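+ *
+ * (SW_BIT_CACHE_FLUSH_DONE is set by pt_iommu_new_table() only after it
+ * has flushed the writes for the newly installed table item.)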
+ * + * This requires the pt_set_bit_release() to be a + * release of the cache flush so that this can acquire + * visibility at the iommu. + */ + if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) && + !pt_test_sw_bit_acquire(&pts, + SW_BIT_CACHE_FLUSH_DONE)) + flush_writes_item(&pts); + } + + /* + * The already present table can possibly be shared with another + * concurrent map. + */ + if (map->leaf_level == level - 1) + ret = pt_descend(&pts, arg, __map_range_leaf); + else + ret = pt_descend(&pts, arg, __map_range); + if (ret) + return ret; + + pts.index++; + pt_index_to_va(&pts); + if (pts.index >= pts.end_index) + break; + } while (true); + return 0; +} + +/* + * Fast path for the easy case of mapping a 4k page to an already allocated + * table. This is a common workload. If it returns EAGAIN run the full algorithm + * instead. + */ +static __always_inline int __do_map_single_page(struct pt_range *range, + void *arg, unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + + pts.type = pt_load_single_entry(&pts); + if (level == 0) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT, + &map->attrs); + /* No flush, not used when incoherent */ + map->oa += PAGE_SIZE; + return 0; + } + if (pts.type == PT_ENTRY_TABLE) + return pt_descend(&pts, arg, descend_fn); + /* Something else, use the slow path */ + return -EAGAIN; +} +PT_MAKE_LEVELS(__map_single_page, __do_map_single_page); + +/* + * Add a table to the top, increasing the top level as much as necessary to + * encompass range. + */ +static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list); + struct pt_common *common = common_from_iommu(iommu_table); + uintptr_t top_of_table = READ_ONCE(common->top_of_table); + uintptr_t new_top_of_table = top_of_table; + struct pt_table_p *table_mem; + unsigned int new_level; + spinlock_t *domain_lock; + unsigned long flags; + int ret; + + while (true) { + struct pt_range top_range = + _pt_top_range(common, new_top_of_table); + struct pt_state pts = pt_init_top(&top_range); + + top_range.va = range->va; + top_range.last_va = range->last_va; + + if (!pt_check_range(&top_range) && + map->leaf_level <= pts.level) { + new_level = pts.level; + break; + } + + pts.level++; + if (pts.level > PT_MAX_TOP_LEVEL || + pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) { + ret = -ERANGE; + goto err_free; + } + + table_mem = + table_alloc_top(common, _pt_top_set(NULL, pts.level), + map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH); + if (IS_ERR(table_mem)) { + ret = PTR_ERR(table_mem); + goto err_free; + } + iommu_pages_list_add(&free_list, table_mem); + + /* The new table links to the lower table always at index 0 */ + top_range.va = 0; + top_range.top_level = pts.level; + pts.table_lower = pts.table; + pts.table = table_mem; + pt_load_single_entry(&pts); + PT_WARN_ON(pts.index != 0); + pt_install_table(&pts, virt_to_phys(pts.table_lower), + &map->attrs); + new_top_of_table = _pt_top_set(pts.table, pts.level); + } + + /* + * Avoid double flushing, flush it once after all pt_install_table() + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + ret = iommu_pages_start_incoherent_list( + &free_list, iommu_table->iommu_device); + if (ret) + goto err_free; + } + + /* + * top_of_table is write locked by the spinlock, but readers 
can use + * READ_ONCE() to get the value. Since we encode both the level and the + * pointer in one quanta the lockless reader will always see something + * valid. The HW must be updated to the new level under the spinlock + * before top_of_table is updated so that concurrent readers don't map + * into the new level until it is fully functional. If another thread + * already updated it while we were working then throw everything away + * and try again. + */ + domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table); + spin_lock_irqsave(domain_lock, flags); + if (common->top_of_table != top_of_table || + top_of_table == new_top_of_table) { + spin_unlock_irqrestore(domain_lock, flags); + ret = -EAGAIN; + goto err_free; + } + + /* + * We do not issue any flushes for change_top on the expectation that + * any walk cache will not become a problem by adding another layer to + * the tree. Misses will rewalk from the updated top pointer, hits + * continue to be correct. Negative caching is fine too since all the + * new IOVA added by the new top is non-present. + */ + iommu_table->driver_ops->change_top( + iommu_table, virt_to_phys(table_mem), new_level); + WRITE_ONCE(common->top_of_table, new_top_of_table); + spin_unlock_irqrestore(domain_lock, flags); + return 0; + +err_free: + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&free_list, + iommu_table->iommu_device); + iommu_put_pages_list(&free_list); + return ret; +} + +static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct pt_common *common = common_from_iommu(iommu_table); + int ret; + + do { + ret = pt_check_range(range); + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + return ret; + + if (!ret && map->leaf_level <= range->top_level) + break; + + ret = increase_top(iommu_table, range, map); + if (ret && ret != -EAGAIN) + return ret; + + /* Reload the new top */ + *range = pt_make_range(common, range->va, range->last_va); + } while (ret); + PT_WARN_ON(pt_check_range(range)); + return 0; +} + +static int do_map(struct pt_range *range, struct pt_common *common, + bool single_page, struct pt_iommu_map_args *map) +{ + /* + * The __map_single_page() fast path does not support DMA_INCOHERENT + * flushing to keep its .text small. + */ + if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + int ret; + + ret = pt_walk_range(range, __map_single_page, map); + if (ret != -EAGAIN) + return ret; + /* EAGAIN falls through to the full path */ + } + + if (map->leaf_level == range->top_level) + return pt_walk_range(range, __map_range_leaf, map); + return pt_walk_range(range, __map_range, map); +} + +/** + * map_pages() - Install translation for an IOVA range + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * @mapped: Total bytes successfully mapped + * + * The range starting at IOVA will have paddr installed into it. The caller + * must specify a valid pgsize and pgcount to segment the range into compatible + * blocks. + * + * On error the caller will probably want to invoke unmap on the range from iova + * up to the amount indicated by @mapped to return the table back to an + * unchanged state. 
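+ *
+ * Example (illustrative only, not a fixed calling convention): drivers
+ * normally reach this op through the core API, e.g.
+ *
+ *	ret = iommu_map(domain, iova, paddr, SZ_2M,
+ *			IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
+ *
+ * which picks a compatible pgsize/pgcount split from domain->pgsize_bitmap
+ * before calling map_pages().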
+ * + * Context: The caller must hold a write range lock that includes the whole + * range. + * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were + * mapped are added to @mapped, @mapped is not zerod first. + */ +int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; + struct pt_common *common = common_from_iommu(iommu_table); + struct iommu_iotlb_gather iotlb_gather; + pt_vaddr_t len = pgsize * pgcount; + struct pt_iommu_map_args map = { + .iotlb_gather = &iotlb_gather, + .oa = paddr, + .leaf_pgsize_lg2 = vaffs(pgsize), + }; + bool single_page = false; + struct pt_range range; + int ret; + + iommu_iotlb_gather_init(&iotlb_gather); + + if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE)))) + return -EINVAL; + + /* Check the paddr doesn't exceed what the table can store */ + if ((sizeof(pt_oaddr_t) < sizeof(paddr) && + (pt_vaddr_t)paddr > PT_VADDR_MAX) || + (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 && + oalog2_div(paddr, common->max_oasz_lg2))) + return -ERANGE; + + ret = pt_iommu_set_prot(common, &map.attrs, prot); + if (ret) + return ret; + map.attrs.gfp = gfp; + + ret = make_range_no_check(common, &range, iova, len); + if (ret) + return ret; + + /* Calculate target page size and level for the leaves */ + if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && + pgcount == 1) { + PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); + if (log2_mod(iova | paddr, PAGE_SHIFT)) + return -ENXIO; + map.leaf_pgsize_lg2 = PAGE_SHIFT; + map.leaf_level = 0; + single_page = true; + } else { + map.leaf_pgsize_lg2 = pt_compute_best_pgsize( + pgsize_bitmap, range.va, range.last_va, paddr); + if (!map.leaf_pgsize_lg2) + return -ENXIO; + map.leaf_level = + pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); + } + + ret = check_map_range(iommu_table, &range, &map); + if (ret) + return ret; + + PT_WARN_ON(map.leaf_level > range.top_level); + + ret = do_map(&range, common, single_page, &map); + + /* + * Table levels were freed and replaced with large items, flush any walk + * cache that may refer to the freed levels. + */ + if (!iommu_pages_list_empty(&iotlb_gather.freelist)) + iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather); + + /* Bytes successfully mapped */ + PT_WARN_ON(!ret && map.oa - paddr != len); + *mapped += map.oa - paddr; + return ret; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); + +struct pt_unmap_args { + struct iommu_pages_list free_list; + pt_vaddr_t unmapped; +}; + +static __maybe_unused int __unmap_range(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_unmap_args *unmap = arg; + unsigned int num_oas = 0; + unsigned int start_index; + int ret = 0; + + _pt_iter_first(&pts); + start_index = pts.index; + pts.type = pt_load_entry_raw(&pts); + /* + * A starting index is in the middle of a contiguous entry + * + * The IOMMU API does not require drivers to support unmapping parts of + * large pages. Long ago VFIO would try to split maps but the current + * version never does. + * + * Instead when unmap reaches a partial unmap of the start of a large + * IOPTE it should remove the entire IOPTE and return that size to the + * caller. 
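+ *
+ * If the requested start is not aligned to the large IOPTE, i.e. it points
+ * into the middle of the entry rather than at its start, the walk below
+ * fails with -EINVAL instead of splitting the entry.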
+ */ + if (pts.type == PT_ENTRY_OA) { + if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts))) + return -EINVAL; + /* Micro optimization */ + goto start_oa; + } + + do { + if (pts.type != PT_ENTRY_OA) { + bool fully_covered; + + if (pts.type != PT_ENTRY_TABLE) { + ret = -EINVAL; + break; + } + + if (pts.index != start_index) + pt_index_to_va(&pts); + pts.table_lower = pt_table_ptr(&pts); + + fully_covered = pt_entry_fully_covered( + &pts, pt_table_item_lg2sz(&pts)); + + ret = pt_descend(&pts, arg, __unmap_range); + if (ret) + break; + + /* + * If the unmapping range fully covers the table then we + * can free it as well. The clear is delayed until we + * succeed in clearing the lower table levels. + */ + if (fully_covered) { + iommu_pages_list_add(&unmap->free_list, + pts.table_lower); + pt_clear_entries(&pts, ilog2(1)); + } + pts.index++; + } else { + unsigned int num_contig_lg2; +start_oa: + /* + * If the caller requested an last that falls within a + * single entry then the entire entry is unmapped and + * the length returned will be larger than requested. + */ + num_contig_lg2 = pt_entry_num_contig_lg2(&pts); + pt_clear_entries(&pts, num_contig_lg2); + num_oas += log2_to_int(num_contig_lg2); + pts.index += log2_to_int(num_contig_lg2); + } + if (pts.index >= pts.end_index) + break; + pts.type = pt_load_entry_raw(&pts); + } while (true); + + unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts)); + flush_writes_range(&pts, start_index, pts.index); + + return ret; +} + +/** + * unmap_pages() - Make a range of IOVA empty/not present + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_pages() will remove a translation created by map_pages(). It cannot + * subdivide a mapping created by map_pages(), so it should be called with IOVA + * ranges that match those passed to map_pages(). The IOVA range can aggregate + * contiguous map_pages() calls so long as no individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the point + * unmapping stopped. 
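+ *
+ * Example (sizes are illustrative): when @iova sits at the start of a 2M
+ * IOPTE, an unmap request of only 4K removes the whole IOPTE and the
+ * returned size is 2M, larger than the requested length.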
+ */ +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( + unmap.free_list) }; + pt_vaddr_t len = pgsize * pgcount; + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, len); + if (ret) + return 0; + + pt_walk_range(&range, __unmap_range, &unmap); + + gather_range_pages(iotlb_gather, iommu_table, iova, len, + &unmap.free_list); + + return unmap.unmapped; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); + +static void NS(get_info)(struct pt_iommu *iommu_table, + struct pt_iommu_info *info) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_top_range(common); + struct pt_state pts = pt_init_top(&range); + pt_vaddr_t pgsize_bitmap = 0; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) { + for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL; + pts.level++) { + if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) + break; + pgsize_bitmap |= pt_possible_sizes(&pts); + } + } else { + for (pts.level = 0; pts.level <= range.top_level; pts.level++) + pgsize_bitmap |= pt_possible_sizes(&pts); + } + + /* Hide page sizes larger than the maximum OA */ + info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2); +} + +static void NS(deinit)(struct pt_iommu *iommu_table) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + /* + * The driver has to already have fenced the HW access to the page table + * and invalidated any caching referring to this memory. + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&collect.free_list, + iommu_table->iommu_device); + iommu_put_pages_list(&collect.free_list); +} + +static const struct pt_iommu_ops NS(ops) = { +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ + IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) + .set_dirty = NS(set_dirty), +#endif + .get_info = NS(get_info), + .deinit = NS(deinit), +}; + +static int pt_init_common(struct pt_common *common) +{ + struct pt_range top_range = pt_top_range(common); + + if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL)) + return -EINVAL; + + if (top_range.top_level == PT_MAX_TOP_LEVEL || + common->max_vasz_lg2 == top_range.max_vasz_lg2) + common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP); + + if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2) + common->features |= BIT(PT_FEAT_FULL_VA); + + /* Requested features must match features compiled into this format */ + if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) || + (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) && + (common->features & PT_FORCE_ENABLED_FEATURES) != + PT_FORCE_ENABLED_FEATURES)) + return -EOPNOTSUPP; + + /* + * Check if the top level of the page table is too small to hold the + * specified maxvasz. 
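+ * Without PT_FEAT_DYNAMIC_TOP the table can never grow, so the configured
+ * top level must already span max_vasz_lg2.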
+ */ + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + top_range.top_level != PT_MAX_TOP_LEVEL) { + struct pt_state pts = { .range = &top_range, + .level = top_range.top_level }; + + if (common->max_vasz_lg2 > + pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts)) + return -EOPNOTSUPP; + } + + if (common->max_oasz_lg2 == 0) + common->max_oasz_lg2 = pt_max_oa_lg2(common); + else + common->max_oasz_lg2 = min(common->max_oasz_lg2, + pt_max_oa_lg2(common)); + return 0; +} + +static int pt_iommu_init_domain(struct pt_iommu *iommu_table, + struct iommu_domain *domain) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_iommu_info info; + struct pt_range range; + + NS(get_info)(iommu_table, &info); + + domain->type = __IOMMU_DOMAIN_PAGING; + domain->pgsize_bitmap = info.pgsize_bitmap; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + range = _pt_top_range(common, + _pt_top_set(NULL, PT_MAX_TOP_LEVEL)); + else + range = pt_top_range(common); + + /* A 64-bit high address space table on a 32-bit system cannot work. */ + domain->geometry.aperture_start = (unsigned long)range.va; + if ((pt_vaddr_t)domain->geometry.aperture_start != range.va) + return -EOVERFLOW; + + /* + * The aperture is limited to what the API can do after considering all + * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used + * to store a VA. Set the aperture to something that is valid for all + * cases. Saturate instead of truncate the end if the types are smaller + * than the top range. aperture_end should be called aperture_last. + */ + domain->geometry.aperture_end = (unsigned long)range.last_va; + if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) { + domain->geometry.aperture_end = ULONG_MAX; + domain->pgsize_bitmap &= ULONG_MAX; + } + domain->geometry.force_aperture = true; + + return 0; +} + +static void pt_iommu_zero(struct pt_iommu_table *fmt_table) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_iommu cfg = *iommu_table; + + static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0); + memset_after(fmt_table, 0, iommu.domain); + + /* The caller can initialize some of these values */ + iommu_table->iommu_device = cfg.iommu_device; + iommu_table->driver_ops = cfg.driver_ops; + iommu_table->nid = cfg.nid; +} + +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg) +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init) + +int pt_iommu_init(struct pt_iommu_table *fmt_table, + const struct pt_iommu_table_cfg *cfg, gfp_t gfp) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_table_p *table_mem; + int ret; + + if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 || + !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2) + return -EINVAL; + + pt_iommu_zero(fmt_table); + common->features = cfg->common.features; + common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2; + common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2; + ret = pt_iommu_fmt_init(fmt_table, cfg); + if (ret) + return ret; + + if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common)) + return -EINVAL; + + ret = pt_init_common(common); + if (ret) + return ret; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + WARN_ON(!iommu_table->driver_ops || + !iommu_table->driver_ops->change_top || + !iommu_table->driver_ops->get_top_lock)) + return -EINVAL; + + if (pt_feature(common, PT_FEAT_SIGN_EXTEND) && + (pt_feature(common, PT_FEAT_FULL_VA) || + pt_feature(common, 
PT_FEAT_DYNAMIC_TOP))) + return -EINVAL; + + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + WARN_ON(!iommu_table->iommu_device)) + return -EINVAL; + + ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain); + if (ret) + return ret; + + table_mem = table_alloc_top(common, common->top_of_table, gfp, + ALLOC_NORMAL); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + pt_top_set(common, table_mem, pt_top_get_level(common)); + + /* Must be last, see pt_iommu_deinit() */ + iommu_table->ops = &NS(ops); + return 0; +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU"); + +#ifdef pt_iommu_fmt_hw_info +#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info) +#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info) +void pt_iommu_hw_info(struct pt_iommu_table *fmt_table, + struct pt_iommu_table_hw_info *info) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range top_range = pt_top_range(common); + + pt_iommu_fmt_hw_info(fmt_table, &top_range, info); +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU"); +#endif + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW)); +MODULE_IMPORT_NS("GENERIC_PT"); +/* For iommu_dirty_bitmap_record() */ +MODULE_IMPORT_NS("IOMMUFD"); + +#endif /* __GENERIC_PT_IOMMU_PT_H */ diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h new file mode 100644 index 000000000000..68278bf15cfe --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_generic_pt.h @@ -0,0 +1,823 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Test the format API directly. + * + */ +#include "kunit_iommu.h" +#include "pt_iter.h" + +static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + int ret; + + KUNIT_ASSERT_EQ(test, len, (size_t)len); + + ret = iommu_map(&priv->domain, va, pa, len, IOMMU_READ | IOMMU_WRITE, + GFP_KERNEL); + KUNIT_ASSERT_NO_ERRNO_FN(test, "map_pages", ret); +} + +#define KUNIT_ASSERT_PT_LOAD(test, pts, entry) \ + ({ \ + pt_load_entry(pts); \ + KUNIT_ASSERT_EQ(test, (pts)->type, entry); \ + }) + +struct check_levels_arg { + struct kunit *test; + void *fn_arg; + void (*fn)(struct kunit *test, struct pt_state *pts, void *arg); +}; + +static int __check_all_levels(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct check_levels_arg *chk = arg; + struct kunit *test = chk->test; + int ret; + + _pt_iter_first(&pts); + + + /* + * If we were able to use the full VA space this should always be the + * last index in each table. 
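+ * (With PT_FEAT_SIGN_EXTEND the top table only decodes the lower half of
+ * its index space, which is why the assertion below uses
+ * max_vasz_lg2 - 1 for the top level.)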
+ */ + if (!(IS_32BIT && range->max_vasz_lg2 > 32)) { + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND) && + pts.level == pts.range->top_level) + KUNIT_ASSERT_EQ(test, pts.index, + log2_to_int(range->max_vasz_lg2 - 1 - + pt_table_item_lg2sz(&pts)) - + 1); + else + KUNIT_ASSERT_EQ(test, pts.index, + log2_to_int(pt_table_oa_lg2sz(&pts) - + pt_table_item_lg2sz(&pts)) - + 1); + } + + if (pt_can_have_table(&pts)) { + pt_load_single_entry(&pts); + KUNIT_ASSERT_EQ(test, pts.type, PT_ENTRY_TABLE); + ret = pt_descend(&pts, arg, __check_all_levels); + KUNIT_ASSERT_EQ(test, ret, 0); + + /* Index 0 is used by the test */ + if (IS_32BIT && !pts.index) + return 0; + KUNIT_ASSERT_NE(chk->test, pts.index, 0); + } + + /* + * A format should not create a table with only one entry, at least this + * test approach won't work. + */ + KUNIT_ASSERT_GT(chk->test, pts.end_index, 1); + + /* + * For increase top we end up using index 0 for the original top's tree, + * so use index 1 for testing instead. + */ + pts.index = 0; + pt_index_to_va(&pts); + pt_load_single_entry(&pts); + if (pts.type == PT_ENTRY_TABLE && pts.end_index > 2) { + pts.index = 1; + pt_index_to_va(&pts); + } + (*chk->fn)(chk->test, &pts, chk->fn_arg); + return 0; +} + +/* + * Call fn for each level in the table with a pts setup to index 0 in a table + * for that level. This allows writing tests that run on every level. + * The test can use every index in the table except the last one. + */ +static void check_all_levels(struct kunit *test, + void (*fn)(struct kunit *test, + struct pt_state *pts, void *arg), + void *fn_arg) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct check_levels_arg chk = { + .test = test, + .fn = fn, + .fn_arg = fn_arg, + }; + int ret; + + if (pt_feature(priv->common, PT_FEAT_DYNAMIC_TOP) && + priv->common->max_vasz_lg2 > range.max_vasz_lg2) + range.last_va = fvalog2_set_mod_max(range.va, + priv->common->max_vasz_lg2); + + /* + * Map a page at the highest VA, this will populate all the levels so we + * can then iterate over them. Index 0 will be used for testing. + */ + if (IS_32BIT && range.max_vasz_lg2 > 32) + range.last_va = (u32)range.last_va; + range.va = range.last_va - (priv->smallest_pgsz - 1); + do_map(test, range.va, 0, priv->smallest_pgsz); + + range = pt_make_range(priv->common, range.va, range.last_va); + ret = pt_walk_range(&range, __check_all_levels, &chk); + KUNIT_ASSERT_EQ(test, ret, 0); +} + +static void test_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + /* Fixture does the setup */ + KUNIT_ASSERT_NE(test, priv->info.pgsize_bitmap, 0); +} + +/* + * Basic check that the log2_* functions are working, especially at the integer + * limits. 
+ */ +static void test_bitops(struct kunit *test) +{ + int i; + + KUNIT_ASSERT_EQ(test, fls_t(u32, 0), 0); + KUNIT_ASSERT_EQ(test, fls_t(u32, 1), 1); + KUNIT_ASSERT_EQ(test, fls_t(u32, BIT(2)), 3); + KUNIT_ASSERT_EQ(test, fls_t(u32, U32_MAX), 32); + + KUNIT_ASSERT_EQ(test, fls_t(u64, 0), 0); + KUNIT_ASSERT_EQ(test, fls_t(u64, 1), 1); + KUNIT_ASSERT_EQ(test, fls_t(u64, BIT(2)), 3); + KUNIT_ASSERT_EQ(test, fls_t(u64, U64_MAX), 64); + + KUNIT_ASSERT_EQ(test, ffs_t(u32, 1), 0); + KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(2)), 2); + KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(31)), 31); + + KUNIT_ASSERT_EQ(test, ffs_t(u64, 1), 0); + KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT(2)), 2); + KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT_ULL(63)), 63); + + for (i = 0; i != 31; i++) + KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i); + + for (i = 0; i != 63; i++) + KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i); + + for (i = 0; i != 32; i++) { + u64 val = get_random_u64(); + + KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffs_t(u32, val)), 0); + KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffs_t(u64, val)), 0); + + KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffz_t(u32, val)), + log2_to_max_int_t(u32, ffz_t(u32, val))); + KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffz_t(u64, val)), + log2_to_max_int_t(u64, ffz_t(u64, val))); + } +} + +static unsigned int ref_best_pgsize(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va, + pt_vaddr_t last_va, pt_oaddr_t oa) +{ + pt_vaddr_t pgsz_lg2; + + /* Brute force the constraints described in pt_compute_best_pgsize() */ + for (pgsz_lg2 = PT_VADDR_MAX_LG2 - 1; pgsz_lg2 != 0; pgsz_lg2--) { + if ((pgsz_bitmap & log2_to_int(pgsz_lg2)) && + log2_mod(va, pgsz_lg2) == 0 && + oalog2_mod(oa, pgsz_lg2) == 0 && + va + log2_to_int(pgsz_lg2) - 1 <= last_va && + log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2) && + oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)) + return pgsz_lg2; + } + return 0; +} + +/* Check that the bit logic in pt_compute_best_pgsize() works. 
*/ +static void test_best_pgsize(struct kunit *test) +{ + unsigned int a_lg2; + unsigned int b_lg2; + unsigned int c_lg2; + + /* Try random prefixes with every suffix combination */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } + + /* 0 prefix, every suffix */ + for (c_lg2 = 1; c_lg2 != PT_VADDR_MAX_LG2 - 1; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = 0; + pt_oaddr_t oa = 0; + pt_vaddr_t last_va = log2_set_mod_max(0, c_lg2); + + KUNIT_ASSERT_EQ(test, + pt_compute_best_pgsize(pgsz_bitmap, va, last_va, + oa), + ref_best_pgsize(pgsz_bitmap, va, last_va, oa)); + } + + /* 1's prefix, every suffix */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = PT_VADDR_MAX << a_lg2; + pt_oaddr_t oa = PT_VADDR_MAX << b_lg2; + pt_vaddr_t last_va = PT_VADDR_MAX; + + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } + + /* pgsize_bitmap is always 0 */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = 0; + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + 0); + } + } + } + + if (sizeof(pt_vaddr_t) <= 4) + return; + + /* over 32 bit page sizes */ + for (a_lg2 = 32; a_lg2 != 42; a_lg2++) { + for (b_lg2 = 32; b_lg2 != 42; b_lg2++) { + for (c_lg2 = 32; c_lg2 != 42; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } +} + +/* + * Check that pt_install_table() and pt_table_pa() match + */ +static void test_lvl_table_ptr(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_oaddr_t paddr = + log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2); + struct pt_write_attrs attrs = {}; + + if (!pt_can_have_table(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + pt_load_single_entry(pts); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + + KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs)); + + /* A second install should pass because install updates pts->entry. 
*/ + KUNIT_ASSERT_EQ(test, pt_install_table(pts, paddr, &attrs), true); + + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_TABLE); + KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr); + + pt_clear_entries(pts, ilog2(1)); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); +} + +static void test_table_ptr(struct kunit *test) +{ + check_all_levels(test, test_lvl_table_ptr, NULL); +} + +struct lvl_radix_arg { + pt_vaddr_t vbits; +}; + +/* + * Check pt_table_oa_lg2sz() and pt_table_item_lg2sz() they need to decode a + * continuous list of VA across all the levels that covers the entire advertised + * VA space. + */ +static void test_lvl_radix(struct kunit *test, struct pt_state *pts, void *arg) +{ + unsigned int table_lg2sz = pt_table_oa_lg2sz(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct lvl_radix_arg *radix = arg; + + /* Every bit below us is decoded */ + KUNIT_ASSERT_EQ(test, log2_set_mod_max(0, isz_lg2), radix->vbits); + + /* We are not decoding bits someone else is */ + KUNIT_ASSERT_EQ(test, log2_div(radix->vbits, isz_lg2), 0); + + /* Can't decode past the pt_vaddr_t size */ + KUNIT_ASSERT_LE(test, table_lg2sz, PT_VADDR_MAX_LG2); + KUNIT_ASSERT_EQ(test, fvalog2_div(table_lg2sz, PT_MAX_VA_ADDRESS_LG2), + 0); + + radix->vbits = fvalog2_set_mod_max(0, table_lg2sz); +} + +static void test_max_va(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + + KUNIT_ASSERT_GE(test, priv->common->max_vasz_lg2, range.max_vasz_lg2); +} + +static void test_table_radix(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct lvl_radix_arg radix = { .vbits = priv->smallest_pgsz - 1 }; + struct pt_range range; + + check_all_levels(test, test_lvl_radix, &radix); + + range = pt_top_range(priv->common); + if (range.max_vasz_lg2 == PT_VADDR_MAX_LG2) { + KUNIT_ASSERT_EQ(test, radix.vbits, PT_VADDR_MAX); + } else { + if (!IS_32BIT) + KUNIT_ASSERT_EQ(test, + log2_set_mod_max(0, range.max_vasz_lg2), + radix.vbits); + KUNIT_ASSERT_EQ(test, log2_div(radix.vbits, range.max_vasz_lg2), + 0); + } +} + +static unsigned int safe_pt_num_items_lg2(const struct pt_state *pts) +{ + struct pt_range top_range = pt_top_range(pts->range->common); + struct pt_state top_pts = pt_init_top(&top_range); + + /* + * Avoid calling pt_num_items_lg2() on the top, instead we can derive + * the size of the top table from the top range. + */ + if (pts->level == top_range.top_level) + return ilog2(pt_range_to_end_index(&top_pts)); + return pt_num_items_lg2(pts); +} + +static void test_lvl_possible_sizes(struct kunit *test, struct pt_state *pts, + void *arg) +{ + unsigned int num_items_lg2 = safe_pt_num_items_lg2(pts); + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!pt_can_have_leaf(pts)) { + KUNIT_ASSERT_EQ(test, pgsize_bitmap, 0); + return; + } + + /* No bits for sizes that would be outside this table */ + KUNIT_ASSERT_EQ(test, log2_mod(pgsize_bitmap, isz_lg2), 0); + KUNIT_ASSERT_EQ( + test, fvalog2_div(pgsize_bitmap, num_items_lg2 + isz_lg2), 0); + + /* + * Non contiguous must be supported. AMDv1 has a HW bug where it does + * not support it on one of the levels. 
+ */ + if ((u64)pgsize_bitmap != 0xff0000000000ULL || + strcmp(__stringify(PTPFX_RAW), "amdv1") != 0) + KUNIT_ASSERT_TRUE(test, pgsize_bitmap & log2_to_int(isz_lg2)); + else + KUNIT_ASSERT_NE(test, pgsize_bitmap, 0); + + /* A contiguous entry should not span the whole table */ + if (num_items_lg2 + isz_lg2 != PT_VADDR_MAX_LG2) + KUNIT_ASSERT_FALSE( + test, + pgsize_bitmap & log2_to_int(num_items_lg2 + isz_lg2)); +} + +static void test_entry_possible_sizes(struct kunit *test) +{ + check_all_levels(test, test_lvl_possible_sizes, NULL); +} + +static void sweep_all_pgsizes(struct kunit *test, struct pt_state *pts, + struct pt_write_attrs *attrs, + pt_oaddr_t test_oaddr) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + unsigned int len_lg2; + + if (pts->index != 0) + return; + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) { + struct pt_state sub_pts = *pts; + pt_oaddr_t oaddr; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + oaddr = log2_set_mod(test_oaddr, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, attrs); + /* Verify that every contiguous item translates correctly */ + for (sub_pts.index = 0; + sub_pts.index != log2_to_int(len_lg2 - isz_lg2); + sub_pts.index++) { + KUNIT_ASSERT_PT_LOAD(test, &sub_pts, PT_ENTRY_OA); + KUNIT_ASSERT_EQ(test, pt_item_oa(&sub_pts), + oaddr + sub_pts.index * + oalog2_mul(1, isz_lg2)); + KUNIT_ASSERT_EQ(test, pt_entry_oa(&sub_pts), oaddr); + KUNIT_ASSERT_EQ(test, pt_entry_num_contig_lg2(&sub_pts), + len_lg2 - isz_lg2); + } + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + } +} + +/* + * Check that pt_install_leaf_entry() and pt_entry_oa() match. + * Check that pt_clear_entries() works. 
+ */ +static void test_lvl_entry_oa(struct kunit *test, struct pt_state *pts, + void *arg) +{ + unsigned int max_oa_lg2 = pts->range->common->max_oasz_lg2; + struct kunit_iommu_priv *priv = test->priv; + struct pt_write_attrs attrs = {}; + + if (!pt_can_have_leaf(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + sweep_all_pgsizes(test, pts, &attrs, priv->test_oa); + + /* Check that the table can store the boundary OAs */ + sweep_all_pgsizes(test, pts, &attrs, 0); + if (max_oa_lg2 == PT_OADDR_MAX_LG2) + sweep_all_pgsizes(test, pts, &attrs, PT_OADDR_MAX); + else + sweep_all_pgsizes(test, pts, &attrs, + oalog2_to_max_int(max_oa_lg2)); +} + +static void test_entry_oa(struct kunit *test) +{ + check_all_levels(test, test_lvl_entry_oa, NULL); +} + +/* Test pt_attr_from_entry() */ +static void test_lvl_attr_from_entry(struct kunit *test, struct pt_state *pts, + void *arg) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct kunit_iommu_priv *priv = test->priv; + unsigned int len_lg2; + unsigned int prot; + + if (!pt_can_have_leaf(pts)) + return; + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) { + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + for (prot = 0; prot <= (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE | + IOMMU_NOEXEC | IOMMU_MMIO); + prot++) { + pt_oaddr_t oaddr; + struct pt_write_attrs attrs = {}; + u64 good_entry; + + /* + * If the format doesn't support this combination of + * prot bits skip it + */ + if (pt_iommu_set_prot(pts->range->common, &attrs, + prot)) { + /* But RW has to be supported */ + KUNIT_ASSERT_NE(test, prot, + IOMMU_READ | IOMMU_WRITE); + continue; + } + + oaddr = log2_set_mod(priv->test_oa, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + good_entry = pts->entry; + + memset(&attrs, 0, sizeof(attrs)); + pt_attr_from_entry(pts, &attrs); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + /* + * The descriptor produced by pt_attr_from_entry() + * produce an identical entry value when re-written + */ + KUNIT_ASSERT_EQ(test, good_entry, pts->entry); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + } + } +} + +static void test_attr_from_entry(struct kunit *test) +{ + check_all_levels(test, test_lvl_attr_from_entry, NULL); +} + +static void test_lvl_dirty(struct kunit *test, struct pt_state *pts, void *arg) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct kunit_iommu_priv *priv = test->priv; + unsigned int start_idx = pts->index; + struct pt_write_attrs attrs = {}; + unsigned int len_lg2; + + if (!pt_can_have_leaf(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ | IOMMU_WRITE)); + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) { + pt_oaddr_t oaddr; + unsigned int i; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + oaddr = log2_set_mod(priv->test_oa, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + pt_load_entry(pts); + pt_entry_make_write_clean(pts); + pt_load_entry(pts); + KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts)); + + for (i = 
0; i != log2_to_int(len_lg2 - isz_lg2); i++) { + /* dirty every contiguous entry */ + pts->index = start_idx + i; + pt_load_entry(pts); + KUNIT_ASSERT_TRUE(test, pt_entry_make_write_dirty(pts)); + pts->index = start_idx; + pt_load_entry(pts); + KUNIT_ASSERT_TRUE(test, pt_entry_is_write_dirty(pts)); + + pt_entry_make_write_clean(pts); + pt_load_entry(pts); + KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts)); + } + + pt_clear_entries(pts, len_lg2 - isz_lg2); + } +} + +static __maybe_unused void test_dirty(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!pt_dirty_supported(priv->common)) + kunit_skip(test, + "Page table features do not support dirty tracking"); + + check_all_levels(test, test_lvl_dirty, NULL); +} + +static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct pt_write_attrs attrs = {}; + unsigned int len_lg2; + + if (!pt_can_have_leaf(pts)) + return; + if (pts->index != 0) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2); + struct pt_write_attrs new_attrs = {}; + unsigned int bitnr; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + pt_install_leaf_entry(pts, paddr, len_lg2, &attrs); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) + KUNIT_ASSERT_FALSE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) { + KUNIT_ASSERT_FALSE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + pt_set_sw_bit_release(pts, bitnr); + KUNIT_ASSERT_TRUE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + } + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) + KUNIT_ASSERT_TRUE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + + KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr); + + /* SW bits didn't leak into the attrs */ + pt_attr_from_entry(pts, &new_attrs); + KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs)); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + } +} + +static __maybe_unused void test_sw_bit_leaf(struct kunit *test) +{ + check_all_levels(test, test_lvl_sw_bit_leaf, NULL); +} + +static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_write_attrs attrs = {}; + pt_oaddr_t paddr = + log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2); + unsigned int bitnr; + + if (!pt_can_have_leaf(pts)) + return; + if (pts->index != 0) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) + KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) { + KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr)); + pt_set_sw_bit_release(pts, bitnr); + KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr)); + } + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) + KUNIT_ASSERT_TRUE(test, 
pt_test_sw_bit_acquire(pts, bitnr)); + + KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr); + + pt_clear_entries(pts, ilog2(1)); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); +} + +static __maybe_unused void test_sw_bit_table(struct kunit *test) +{ + check_all_levels(test, test_lvl_sw_bit_table, NULL); +} + +static struct kunit_case generic_pt_test_cases[] = { + KUNIT_CASE_FMT(test_init), + KUNIT_CASE_FMT(test_bitops), + KUNIT_CASE_FMT(test_best_pgsize), + KUNIT_CASE_FMT(test_table_ptr), + KUNIT_CASE_FMT(test_max_va), + KUNIT_CASE_FMT(test_table_radix), + KUNIT_CASE_FMT(test_entry_possible_sizes), + KUNIT_CASE_FMT(test_entry_oa), + KUNIT_CASE_FMT(test_attr_from_entry), +#ifdef pt_entry_is_write_dirty + KUNIT_CASE_FMT(test_dirty), +#endif +#ifdef pt_sw_bit + KUNIT_CASE_FMT(test_sw_bit_leaf), + KUNIT_CASE_FMT(test_sw_bit_table), +#endif + {}, +}; + +static int pt_kunit_generic_pt_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv; + int ret; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + ret = pt_kunit_priv_init(test, priv); + if (ret) { + kunit_kfree(test, priv); + return ret; + } + test->priv = priv; + return 0; +} + +static void pt_kunit_generic_pt_exit(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!test->priv) + return; + + pt_iommu_deinit(priv->iommu); + kunit_kfree(test, test->priv); +} + +static struct kunit_suite NS(generic_pt_suite) = { + .name = __stringify(NS(fmt_test)), + .init = pt_kunit_generic_pt_init, + .exit = pt_kunit_generic_pt_exit, + .test_cases = generic_pt_test_cases, +}; +kunit_test_suites(&NS(generic_pt_suite)); diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h new file mode 100644 index 000000000000..22c9e4c4dd97 --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_iommu.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_KUNIT_IOMMU_H +#define __GENERIC_PT_KUNIT_IOMMU_H + +#define GENERIC_PT_KUNIT 1 +#include <kunit/device.h> +#include <kunit/test.h> +#include "../iommu-pages.h" +#include "pt_iter.h" + +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg) +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init) +int pt_iommu_init(struct pt_iommu_table *fmt_table, + const struct pt_iommu_table_cfg *cfg, gfp_t gfp); + +/* The format can provide a list of configurations it would like to test */ +#ifdef kunit_fmt_cfgs +static const void *kunit_pt_gen_params_cfg(struct kunit *test, const void *prev, + char *desc) +{ + uintptr_t cfg_id = (uintptr_t)prev; + + cfg_id++; + if (cfg_id >= ARRAY_SIZE(kunit_fmt_cfgs) + 1) + return NULL; + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s_cfg_%u", + __stringify(PTPFX_RAW), (unsigned int)(cfg_id - 1)); + return (void *)cfg_id; +} +#define KUNIT_CASE_FMT(test_name) \ + KUNIT_CASE_PARAM(test_name, kunit_pt_gen_params_cfg) +#else +#define KUNIT_CASE_FMT(test_name) KUNIT_CASE(test_name) +#endif + +#define KUNIT_ASSERT_NO_ERRNO(test, ret) \ + KUNIT_ASSERT_EQ_MSG(test, ret, 0, KUNIT_SUBSUBTEST_INDENT "errno %pe", \ + ERR_PTR(ret)) + +#define KUNIT_ASSERT_NO_ERRNO_FN(test, fn, ret) \ + KUNIT_ASSERT_EQ_MSG(test, ret, 0, \ + KUNIT_SUBSUBTEST_INDENT "errno %pe from %s", \ + ERR_PTR(ret), fn) + +/* + * When the test is run on a 32 bit system unsigned long can be 32 bits. This + * cause the iommu op signatures to be restricted to 32 bits. 
Meaning the test + * has to be mindful not to create any VA's over the 32 bit limit. Reduce the + * scope of the testing as the main purpose of checking on full 32 bit is to + * look for 32bitism in the core code. Run the test on i386 with X86_PAE=y to + * get the full coverage when dma_addr_t & phys_addr_t are 8 bytes + */ +#define IS_32BIT (sizeof(unsigned long) == 4) + +struct kunit_iommu_priv { + union { + struct iommu_domain domain; + struct pt_iommu_table fmt_table; + }; + spinlock_t top_lock; + struct device *dummy_dev; + struct pt_iommu *iommu; + struct pt_common *common; + struct pt_iommu_table_cfg cfg; + struct pt_iommu_info info; + unsigned int smallest_pgsz_lg2; + pt_vaddr_t smallest_pgsz; + unsigned int largest_pgsz_lg2; + pt_oaddr_t test_oa; + pt_vaddr_t safe_pgsize_bitmap; + unsigned long orig_nr_secondary_pagetable; + +}; +PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain); + +static void pt_kunit_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + iommu_put_pages_list(&gather->freelist); +} + +#define IOMMU_PT_DOMAIN_OPS1(x) IOMMU_PT_DOMAIN_OPS(x) +static const struct iommu_domain_ops kunit_pt_ops = { + IOMMU_PT_DOMAIN_OPS1(PTPFX_RAW), + .iotlb_sync = &pt_kunit_iotlb_sync, +}; + +static void pt_kunit_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level) +{ +} + +static spinlock_t *pt_kunit_get_top_lock(struct pt_iommu *iommu_table) +{ + struct kunit_iommu_priv *priv = container_of( + iommu_table, struct kunit_iommu_priv, fmt_table.iommu); + + return &priv->top_lock; +} + +static const struct pt_iommu_driver_ops pt_kunit_driver_ops = { + .change_top = &pt_kunit_change_top, + .get_top_lock = &pt_kunit_get_top_lock, +}; + +static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv) +{ + unsigned int va_lg2sz; + int ret; + + /* Enough so the memory allocator works */ + priv->dummy_dev = kunit_device_register(test, "pt_kunit_dev"); + if (IS_ERR(priv->dummy_dev)) + return PTR_ERR(priv->dummy_dev); + set_dev_node(priv->dummy_dev, NUMA_NO_NODE); + + spin_lock_init(&priv->top_lock); + +#ifdef kunit_fmt_cfgs + priv->cfg = kunit_fmt_cfgs[((uintptr_t)test->param_value) - 1]; + /* + * The format can set a list of features that the kunit_fmt_cfgs + * controls, other features are default to on. + */ + priv->cfg.common.features |= PT_SUPPORTED_FEATURES & + (~KUNIT_FMT_FEATURES); +#else + priv->cfg.common.features = PT_SUPPORTED_FEATURES; +#endif + + /* Defaults, for the kunit */ + if (!priv->cfg.common.hw_max_vasz_lg2) + priv->cfg.common.hw_max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2; + if (!priv->cfg.common.hw_max_oasz_lg2) + priv->cfg.common.hw_max_oasz_lg2 = pt_max_oa_lg2(NULL); + + priv->fmt_table.iommu.nid = NUMA_NO_NODE; + priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops; + priv->fmt_table.iommu.iommu_device = priv->dummy_dev; + priv->domain.ops = &kunit_pt_ops; + ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL); + if (ret) { + if (ret == -EOVERFLOW) + kunit_skip( + test, + "This configuration cannot be tested on 32 bit"); + return ret; + } + + priv->iommu = &priv->fmt_table.iommu; + priv->common = common_from_iommu(&priv->fmt_table.iommu); + priv->iommu->ops->get_info(priv->iommu, &priv->info); + + /* + * size_t is used to pass the mapping length, it can be 32 bit, truncate + * the pagesizes so we don't use large sizes. 
+ */ + priv->info.pgsize_bitmap = (size_t)priv->info.pgsize_bitmap; + + priv->smallest_pgsz_lg2 = vaffs(priv->info.pgsize_bitmap); + priv->smallest_pgsz = log2_to_int(priv->smallest_pgsz_lg2); + priv->largest_pgsz_lg2 = + vafls((dma_addr_t)priv->info.pgsize_bitmap) - 1; + + priv->test_oa = + oalog2_mod(0x74a71445deadbeef, priv->common->max_oasz_lg2); + + /* + * We run out of VA space if the mappings get too big, make something + * smaller that can safely pass through dma_addr_t API. + */ + va_lg2sz = priv->common->max_vasz_lg2; + if (IS_32BIT && va_lg2sz > 32) + va_lg2sz = 32; + priv->safe_pgsize_bitmap = + log2_mod(priv->info.pgsize_bitmap, va_lg2sz - 1); + + return 0; +} + +#endif diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h new file mode 100644 index 000000000000..e8a63c8ea850 --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "kunit_iommu.h" +#include "pt_iter.h" +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len); + +struct count_valids { + u64 per_size[PT_VADDR_MAX_LG2]; +}; + +static int __count_valids(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct count_valids *valids = arg; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + pt_descend(&pts, arg, __count_valids); + continue; + } + if (pts.type == PT_ENTRY_OA) { + valids->per_size[pt_entry_oa_lg2sz(&pts)]++; + continue; + } + } + return 0; +} + +/* + * Number of valid table entries. This counts contiguous entries as a single + * valid. 
+ */ +static unsigned int count_valids(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) + total += valids.per_size[i]; + return total; +} + +/* Only a single page size is present, count the number of valid entries */ +static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) { + if ((1ULL << i) == pgsz) + total = valids.per_size[i]; + else + KUNIT_ASSERT_EQ(test, valids.per_size[i], 0); + } + return total; +} + +static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + size_t ret; + + ret = iommu_unmap(&priv->domain, va, len); + KUNIT_ASSERT_EQ(test, ret, len); +} + +static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2); + pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2); + + for (; pfn != end_pfn; pfn++) { + phys_addr_t res = iommu_iova_to_phys(&priv->domain, + pfn * priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa); + if (res != pa) + break; + pa += priv->smallest_pgsz; + } +} + +static void test_increase_level(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_common *common = priv->common; + + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format"); + + if (IS_32BIT) + kunit_skip(test, "Unable to test on 32bit"); + + KUNIT_ASSERT_GT(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + + /* Add every possible level to the max */ + while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) { + struct pt_range top_range = pt_top_range(common); + + if (top_range.va == 0) + do_map(test, top_range.last_va + 1, 0, + priv->smallest_pgsz); + else + do_map(test, top_range.va - priv->smallest_pgsz, 0, + priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level, + top_range.top_level + 1); + KUNIT_ASSERT_GE(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + } +} + +static void test_map_simple(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + pt_vaddr_t cur_va; + + /* Map every reported page size */ + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + + cur_va = ALIGN(cur_va, len); + do_map(test, cur_va, paddr, len); + if (len <= SZ_2G) + check_iova(test, cur_va, paddr, len); + cur_va += len; + } + + /* The read interface reports that every page size was created */ + range = pt_top_range(priv->common); + 
KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + if (pgsize_bitmap & (1ULL << pgsz_lg2)) + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1); + else + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0); + } + + /* Unmap works */ + range = pt_top_range(priv->common); + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + cur_va = ALIGN(cur_va, len); + do_unmap(test, cur_va, len); + cur_va += len; + } + KUNIT_ASSERT_EQ(test, count_valids(test), 0); +} + +/* + * Test to convert a table pointer into an OA by mapping something small, + * unmapping it so as to leave behind a table pointer, then mapping something + * larger that will convert the table into an OA. + */ +static void test_map_table_to_oa(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t limited_pgbitmap = + priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G); + struct pt_range range = pt_top_range(priv->common); + unsigned int pgsz_lg2; + pt_vaddr_t max_pgsize; + pt_vaddr_t cur_va; + + max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1); + KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize); + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + pt_vaddr_t offset; + + if (!(priv->info.pgsize_bitmap & len)) + continue; + if (len > max_pgsize) + break; + + cur_va = ALIGN(range.va + priv->smallest_pgsz * 256, + max_pgsize); + for (offset = 0; offset != max_pgsize; offset += len) + do_map(test, cur_va + offset, paddr + offset, len); + check_iova(test, cur_va, paddr, max_pgsize); + KUNIT_ASSERT_EQ(test, count_valids_single(test, len), + log2_div(max_pgsize, pgsz_lg2)); + + if (len == max_pgsize) { + do_unmap(test, cur_va, max_pgsize); + } else { + do_unmap(test, cur_va, max_pgsize / 2); + for (offset = max_pgsize / 2; offset != max_pgsize; + offset += len) + do_unmap(test, cur_va + offset, len); + } + + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + } +} + +/* + * Test unmapping a small page at the start of a large page. This always unmaps + * the large page. 
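+ * iommu_unmap() is expected to report the size of the whole large page it removed, not just the requested length.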
+ */ +static void test_unmap_split(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + unsigned int count = 0; + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_vaddr_t base_len = log2_to_int(pgsz_lg2); + unsigned int next_pgsz_lg2; + + if (!(pgsize_bitmap & base_len)) + continue; + + for (next_pgsz_lg2 = pgsz_lg2 + 1; + next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) { + pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2); + pt_vaddr_t vaddr = top_range.va; + pt_oaddr_t paddr = 0; + size_t gnmapped; + + if (!(pgsize_bitmap & next_len)) + continue; + + do_map(test, vaddr, paddr, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr, base_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + + /* Make sure unmap doesn't keep going */ + do_map(test, vaddr, paddr, next_len); + do_map(test, vaddr + next_len, paddr, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr, base_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr + next_len, + next_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + + count++; + } + } + + if (count == 0) + kunit_skip(test, "Test needs two page sizes"); +} + +static void unmap_collisions(struct kunit *test, struct maple_tree *mt, + pt_vaddr_t start, pt_vaddr_t last) +{ + struct kunit_iommu_priv *priv = test->priv; + MA_STATE(mas, mt, start, last); + void *entry; + + mtree_lock(mt); + mas_for_each(&mas, entry, last) { + pt_vaddr_t mas_start = mas.index; + pt_vaddr_t len = (mas.last - mas_start) + 1; + pt_oaddr_t paddr; + + mas_erase(&mas); + mas_pause(&mas); + mtree_unlock(mt); + + paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2); + check_iova(test, mas_start, paddr, len); + do_unmap(test, mas_start, len); + mtree_lock(mt); + } + mtree_unlock(mt); +} + +static void clamp_range(struct kunit *test, struct pt_range *range) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (range->last_va - range->va > SZ_1G) + range->last_va = range->va + SZ_1G; + KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX); + if (range->va <= MAPLE_RESERVED_RANGE) + range->va = + ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz); +} + +/* + * Randomly map and unmap ranges that can large physical pages. If a random + * range overlaps with existing ranges then unmap them. This hits all the + * special cases. 
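+ * A maple tree records what is currently mapped so colliding ranges can be found and unmapped before remapping.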
+ */ +static void test_random_map(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range upper_range = pt_upper_range(priv->common); + struct pt_range top_range = pt_top_range(priv->common); + struct maple_tree mt; + unsigned int iter; + + mt_init(&mt); + + /* + * Shrink the range so randomization is more likely to have + * intersections + */ + clamp_range(test, &top_range); + clamp_range(test, &upper_range); + + for (iter = 0; iter != 1000; iter++) { + struct pt_range *range = &top_range; + pt_oaddr_t paddr; + pt_vaddr_t start; + pt_vaddr_t end; + int ret; + + if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) && + ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1)) + range = &upper_range; + + start = get_random_u32_below( + min(U32_MAX, range->last_va - range->va)); + end = get_random_u32_below( + min(U32_MAX, range->last_va - start)); + + start = ALIGN_DOWN(start, priv->smallest_pgsz); + end = ALIGN(end, priv->smallest_pgsz); + start += range->va; + end += start; + if (start < range->va || end > range->last_va + 1 || + start >= end) + continue; + + /* Try overmapping to test the failure handling */ + paddr = oalog2_mod(start, priv->common->max_oasz_lg2); + ret = iommu_map(&priv->domain, start, paddr, end - start, + IOMMU_READ | IOMMU_WRITE, GFP_KERNEL); + if (ret) { + KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE); + unmap_collisions(test, &mt, start, end - 1); + do_map(test, start, paddr, end - start); + } + + KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range", + mtree_insert_range(&mt, start, end - 1, + XA_ZERO_ENTRY, + GFP_KERNEL)); + + check_iova(test, start, paddr, end - start); + if (iter % 100) + cond_resched(); + } + + unmap_collisions(test, &mt, 0, PT_VADDR_MAX); + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + + mtree_destroy(&mt); +} + +/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */ +static void test_pgsize_boundary(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + + if (top_range.va != 0 || top_range.last_va < 0xfef9ffff || + priv->smallest_pgsz != SZ_4K) + kunit_skip(test, "Format does not have the required range"); + + do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1); +} + +/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */ +static void test_mixed(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + u64 start = 0x3fe400ULL << 12; + u64 end = 0x4c0600ULL << 12; + pt_vaddr_t len = end - start; + pt_oaddr_t oa = start; + + if (top_range.last_va <= start || sizeof(unsigned long) == 4) + kunit_skip(test, "range is too small"); + if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21))) + kunit_skip(test, "incompatible psize"); + + do_map(test, start, oa, len); + /* 14 2M, 3 1G, 3 2M */ + KUNIT_ASSERT_EQ(test, count_valids(test), 20); + check_iova(test, start, oa, len); +} + +static struct kunit_case iommu_test_cases[] = { + KUNIT_CASE_FMT(test_increase_level), + KUNIT_CASE_FMT(test_map_simple), + KUNIT_CASE_FMT(test_map_table_to_oa), + KUNIT_CASE_FMT(test_unmap_split), + KUNIT_CASE_FMT(test_random_map), + KUNIT_CASE_FMT(test_pgsize_boundary), + KUNIT_CASE_FMT(test_mixed), + {}, +}; + +static int pt_kunit_iommu_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv; + int ret; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + 
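/* Snapshot the secondary page table accounting so pt_kunit_iommu_exit() can check for leaked table memory */ +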
priv->orig_nr_secondary_pagetable = + global_node_page_state(NR_SECONDARY_PAGETABLE); + ret = pt_kunit_priv_init(test, priv); + if (ret) { + kunit_kfree(test, priv); + return ret; + } + test->priv = priv; + return 0; +} + +static void pt_kunit_iommu_exit(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!test->priv) + return; + + pt_iommu_deinit(priv->iommu); + /* + * Look for memory leaks, assumes kunit is running isolated and nothing + * else is using secondary page tables. + */ + KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable, + global_node_page_state(NR_SECONDARY_PAGETABLE)); + kunit_kfree(test, test->priv); +} + +static struct kunit_suite NS(iommu_suite) = { + .name = __stringify(NS(iommu_test)), + .init = pt_kunit_iommu_init, + .exit = pt_kunit_iommu_exit, + .test_cases = iommu_test_cases, +}; +kunit_test_suites(&NS(iommu_suite)); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Kunit for generic page table"); +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h new file mode 100644 index 000000000000..e1123d35c907 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_common.h @@ -0,0 +1,389 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included after the format. It contains definitions + * that build on the format definitions to create the basic format API. + * + * The format API is listed here, with kdocs. The functions without bodies are + * implemented in the format using the pattern: + * static inline FMTpt_XXX(..) {..} + * #define pt_XXX FMTpt_XXX + * + * If the format doesn't implement a function then pt_fmt_defaults.h can provide + * a generic version. + * + * The routines marked "@pts: Entry to query" operate on the entire contiguous + * entry and can be called with a pts->index pointing to any sub item that makes + * up that entry. + * + * The header order is: + * pt_defs.h + * FMT.h + * pt_common.h + */ +#ifndef __GENERIC_PT_PT_COMMON_H +#define __GENERIC_PT_PT_COMMON_H + +#include "pt_defs.h" +#include "pt_fmt_defaults.h" + +/** + * pt_attr_from_entry() - Convert the permission bits back to attrs + * @pts: Entry to convert from + * @attrs: Resulting attrs + * + * Fill in the attrs with the permission bits encoded in the current leaf entry. + * The attrs should be usable with pt_install_leaf_entry() to reconstruct the + * same entry. + */ +static inline void pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs); + +/** + * pt_can_have_leaf() - True if the current level can have an OA entry + * @pts: The current level + * + * True if the current level can support pt_install_leaf_entry(). A leaf + * entry produce an OA. + */ +static inline bool pt_can_have_leaf(const struct pt_state *pts); + +/** + * pt_can_have_table() - True if the current level can have a lower table + * @pts: The current level + * + * Every level except 0 is allowed to have a lower table. + */ +static inline bool pt_can_have_table(const struct pt_state *pts) +{ + /* No further tables at level 0 */ + return pts->level > 0; +} + +/** + * pt_clear_entries() - Make entries empty (non-present) + * @pts: Starting table index + * @num_contig_lg2: Number of contiguous items to clear + * + * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY + * and does not have any effect on table walking. The starting index must be + * aligned to num_contig_lg2. 
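+ * For example, clearing a 16 item contiguous entry passes num_contig_lg2 == 4 and requires pts->index to be a multiple of 16.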
+ */ +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2); + +/** + * pt_entry_make_write_dirty() - Make an entry dirty + * @pts: Table entry to change + * + * Make pt_entry_is_write_dirty() return true for this entry. This can be called + * asynchronously with any other table manipulation under a RCU lock and must + * not corrupt the table. + */ +static inline bool pt_entry_make_write_dirty(struct pt_state *pts); + +/** + * pt_entry_make_write_clean() - Make the entry write clean + * @pts: Table entry to change + * + * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will + * eventually be notified of this change via a TLB flush, which is the point + * that the HW must become synchronized. Any "write dirty" prior to the TLB + * flush can be lost, but once the TLB flush completes all writes must make + * their entries write dirty. + * + * The format should alter the entry in a way that is compatible with any + * concurrent update from HW. The entire contiguous entry is changed. + */ +static inline void pt_entry_make_write_clean(struct pt_state *pts); + +/** + * pt_entry_is_write_dirty() - True if the entry has been written to + * @pts: Entry to query + * + * "write dirty" means that the HW has written to the OA translated + * by this entry. If the entry is contiguous then the consolidated + * "write dirty" for all the items must be returned. + */ +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts); + +/** + * pt_dirty_supported() - True if the page table supports dirty tracking + * @common: Page table to query + */ +static inline bool pt_dirty_supported(struct pt_common *common); + +/** + * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry + * @pts: Entry to query + * + * Return the number of contiguous items this leaf entry spans. If the entry + * is single item it returns ilog2(1). + */ +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts); + +/** + * pt_entry_oa() - Output Address for this leaf entry + * @pts: Entry to query + * + * Return the output address for the start of the entry. If the entry + * is contiguous this returns the same value for each sub-item. I.e.:: + * + * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0 + * + * See pt_item_oa(). The format should implement one of these two functions + * depending on how it stores the OAs in the table. + */ +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts); + +/** + * pt_entry_oa_lg2sz() - Return the size of an OA entry + * @pts: Entry to query + * + * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise + * it returns the total VA/OA size of the entire contiguous entry. + */ +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts) +{ + return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts); +} + +/** + * pt_entry_oa_exact() - Return the complete OA for an entry + * @pts: Entry to query + * + * During iteration the first entry could have a VA with an offset from the + * natural start of the entry. Return the exact OA including the pts's VA + * offset. + */ +static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts) +{ + return _pt_entry_oa_fast(pts) | + log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts)); +} + +/** + * pt_full_va_prefix() - The top bits of the VA + * @common: Page table to query + * + * This is usually 0, but some formats have their VA space going downward from + * PT_VADDR_MAX, and will return that instead. 
This value must always be + * adjusted by struct pt_common max_vasz_lg2. + */ +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common); + +/** + * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry + * @common: Page table to query + * + * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT). + * This is useful to create optimized paths for common cases of PAGE_SIZE + * mappings. + */ +static inline bool pt_has_system_page_size(const struct pt_common *common); + +/** + * pt_install_leaf_entry() - Write a leaf entry to the table + * @pts: Table index to change + * @oa: Output Address for this leaf + * @oasz_lg2: Size in VA/OA for this leaf + * @attrs: Attributes to modify the entry + * + * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates + * the VA indicated by pts to the given OA. + * + * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz(). + * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2. + * + * This must not be called if pt_can_have_leaf() == false. Contiguous sizes + * not indicated by pt_possible_sizes() must not be specified. + */ +static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs); + +/** + * pt_install_table() - Write a table entry to the table + * @pts: Table index to change + * @table_pa: CPU physical address of the lower table's memory + * @attrs: Attributes to modify the table index + * + * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa + * is the table at pts->level - 1. This is done by cmpxchg so pts must have the + * current entry loaded. The pts is updated with the installed entry. + * + * This must not be called if pt_can_have_table() == false. + * + * Returns: true if the table was installed successfully. + */ +static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs); + +/** + * pt_item_oa() - Output Address for this leaf item + * @pts: Item to query + * + * Return the output address for this item. If the item is part of a contiguous + * entry it returns the value of the OA for this individual sub item. + * + * See pt_entry_oa(). The format should implement one of these two functions + * depending on how it stores the OA's in the table. + */ +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts); + +/** + * pt_load_entry_raw() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Return the type of entry that was loaded. pts->entry will be filled in with + * the entry's content. See pt_load_entry() + */ +static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts); + +/** + * pt_max_oa_lg2() - Return the maximum OA the table format can hold + * @common: Page table to query + * + * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the + * OA. This is the absolute maximum address the table can hold. struct pt_common + * max_oasz_lg2 sets a lower dynamic maximum based on HW capability. + */ +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common); + +/** + * pt_num_items_lg2() - Return the number of items in this table level + * @pts: The current level + * + * The number of items in a table level defines the number of bits this level + * decodes from the VA. This function is not called for the top level, + * so it does not need to compute a special value for the top case. 
The + * result for the top is based on pt_common max_vasz_lg2. + * + * The value is used as part of determining the table indexes via the + * equation:: + * + * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2()) + */ +static inline unsigned int pt_num_items_lg2(const struct pt_state *pts); + +/** + * pt_pgsz_lg2_to_level - Return the level that maps the page size + * @common: Page table to query + * @pgsize_lg2: Log2 page size + * + * Returns the table level that will map the given page size. The page + * size must be part of the pt_possible_sizes() for some level. + */ +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2); + +/** + * pt_possible_sizes() - Return a bitmap of possible output sizes at this level + * @pts: The current level + * + * Each level has a list of possible output sizes that can be installed as + * leaf entries. If pt_can_have_leaf() is false returns zero. + * + * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating + * that a non-contiguous single item leaf entry is supported. The following + * pt_num_items_lg2() number of bits can be set indicating contiguous entries + * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be + * set, contiguous entries cannot span the entire table. + * + * The OR of pt_possible_sizes() of all levels is the typical bitmask of all + * supported sizes in the entire table. + */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts); + +/** + * pt_table_item_lg2sz() - Size of a single item entry in this table level + * @pts: The current level + * + * The size of the item specifies how much VA and OA a single item occupies. + * + * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous + * entries. + */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +/** + * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table + * @pts: The current level + * + * Return the size of VA decoded by the entire table level. + */ +static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts) +{ + if (pts->range->top_level == pts->level) + return pts->range->max_vasz_lg2; + return min_t(unsigned int, pts->range->common->max_vasz_lg2, + pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts)); +} + +/** + * pt_table_pa() - Return the CPU physical address of the table entry + * @pts: Entry to query + * + * This is only ever called on PT_ENTRY_TABLE entries. Must return the same + * value passed to pt_install_table(). + */ +static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts); + +/** + * pt_table_ptr() - Return a CPU pointer for a table item + * @pts: Entry to query + * + * Same as pt_table_pa() but returns a CPU pointer. + */ +static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts) +{ + return __va(pt_table_pa(pts)); +} + +/** + * pt_max_sw_bit() - Return the maximum software bit usable for any level and + * entry + * @common: Page table + * + * The swbit can be passed as bitnr to the other sw_bit functions. + */ +static inline unsigned int pt_max_sw_bit(struct pt_common *common); + +/** + * pt_test_sw_bit_acquire() - Read a software bit in an item + * @pts: Entry to read + * @bitnr: Bit to read + * + * Software bits are ignored by HW and can be used for any purpose by the + * software. This does a test bit and acquire operation. 
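+ * The acquire pairs with pt_set_sw_bit_release() so stores made before the bit was set are visible to the reader that observes it.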
+ */ +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr); + +/** + * pt_set_sw_bit_release() - Set a software bit in an item + * @pts: Entry to set + * @bitnr: Bit to set + * + * Software bits are ignored by HW and can be used for any purpose by the + * software. This does a set bit and release operation. + */ +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr); + +/** + * pt_load_entry() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Set the type of entry that was loaded. pts->entry and pts->table_lower + * will be filled in with the entry's content. + */ +static inline void pt_load_entry(struct pt_state *pts) +{ + pts->type = pt_load_entry_raw(pts); + if (pts->type == PT_ENTRY_TABLE) + pts->table_lower = pt_table_ptr(pts); +} +#endif diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h new file mode 100644 index 000000000000..c25544d72f97 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_defs.h @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included before the format. It contains definitions + * that are required to compile the format. The header order is: + * pt_defs.h + * fmt_XX.h + * pt_common.h + */ +#ifndef __GENERIC_PT_DEFS_H +#define __GENERIC_PT_DEFS_H + +#include <linux/generic_pt/common.h> + +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/bits.h> +#include <linux/limits.h> +#include <linux/bug.h> +#include <linux/kconfig.h> +#include "pt_log2.h" + +/* Header self-compile default defines */ +#ifndef pt_write_attrs +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; +#endif + +struct pt_table_p; + +enum { + PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32, + PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32, +}; + +/* + * The format instantiation can have features wired off or on to optimize the + * code gen. Supported features are just a reflection of what the current set of + * kernel users want to use. + */ +#ifndef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES 0 +#endif + +/* + * When in debug mode we compile all formats with all features. This allows the + * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or + * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +enum { + PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES, + PT_DEBUG_SUPPORTED_FEATURES = + UINT_MAX & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ? + 0 : + BIT(PT_FEAT_DMA_INCOHERENT))) & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ? + BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) : + BIT(PT_FEAT_SIGN_EXTEND)), +}; +#undef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES +#endif + +#ifndef PT_FORCE_ENABLED_FEATURES +#define PT_FORCE_ENABLED_FEATURES 0 +#endif + +/** + * DOC: Generic Page Table Language + * + * Language used in Generic Page Table + * VA + * The input address to the page table, often the virtual address. + * OA + * The output address from the page table, often the physical address. + * leaf + * An entry that results in an output address. + * start/end + * An half-open range, e.g. [0,0) refers to no VA. 
+ * start/last + * An inclusive closed range, e.g. [0,0] refers to the VA 0 + * common + * The generic page table container struct pt_common + * level + * Level 0 is always a table of only leaves with no futher table pointers. + * Increasing levels increase the size of the table items. The least + * significant VA bits used to index page tables are used to index the Level + * 0 table. The various labels for table levels used by HW descriptions are + * not used. + * top_level + * The inclusive highest level of the table. A two-level table + * has a top level of 1. + * table + * A linear array of translation items for that level. + * index + * The position in a table of an element: item = table[index] + * item + * A single index in a table + * entry + * A single logical element in a table. If contiguous pages are not + * supported then item and entry are the same thing, otherwise entry refers + * to all the items that comprise a single contiguous translation. + * item/entry_size + * The number of bytes of VA the table index translates for. + * If the item is a table entry then the next table covers + * this size. If the entry translates to an output address then the + * full OA is: OA | (VA % entry_size) + * contig_count + * The number of consecutive items fused into a single entry. + * item_size * contig_count is the size of that entry's translation. + * lg2 + * Indicates the value is encoded as log2, i.e. 1<<x is the actual value. + * Normally the compiler is fine to optimize divide and mod with log2 values + * automatically when inlining, however if the values are not constant + * expressions it can't. So we do it by hand; we want to avoid 64-bit + * divmod. + */ + +/* Returned by pt_load_entry() and for_each_pt_level_entry() */ +enum pt_entry_type { + PT_ENTRY_EMPTY, + /* Entry is valid and points to a lower table level */ + PT_ENTRY_TABLE, + /* Entry is valid and returns an output address */ + PT_ENTRY_OA, +}; + +struct pt_range { + struct pt_common *common; + struct pt_table_p *top_table; + pt_vaddr_t va; + pt_vaddr_t last_va; + u8 top_level; + u8 max_vasz_lg2; +}; + +/* + * Similar to xa_state, this records information about an in-progress parse at a + * single level. + */ +struct pt_state { + struct pt_range *range; + struct pt_table_p *table; + struct pt_table_p *table_lower; + u64 entry; + enum pt_entry_type type; + unsigned short index; + unsigned short end_index; + u8 level; +}; + +#define pt_cur_table(pts, type) ((type *)((pts)->table)) + +/* + * Try to install a new table pointer. The locking methodology requires this to + * be atomic (multiple threads can race to install a pointer). The losing + * threads will fail the atomic and return false. They should free any memory + * and reparse the table level again. + */ +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry) +{ + u64 *entryp = pt_cur_table(pts, u64) + pts->index; + u64 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. 
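+ * The explicit dma_wmb() below provides that ordering for the device on !SMP builds.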
+ */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg64_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} +#endif + +static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry) +{ + u32 *entryp = pt_cur_table(pts, u32) + pts->index; + u32 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. + */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} + +#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr)) + +static inline bool pt_feature(const struct pt_common *common, + unsigned int feature_nr) +{ + if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr)) + return true; + if (!PT_SUPPORTED_FEATURE(feature_nr)) + return false; + return common->features & BIT(feature_nr); +} + +static inline bool pts_feature(const struct pt_state *pts, + unsigned int feature_nr) +{ + return pt_feature(pts->range->common, feature_nr); +} + +/* + * PT_WARN_ON is used for invariants that the kunit should be checking can't + * happen. + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +#define PT_WARN_ON WARN_ON +#else +static inline bool PT_WARN_ON(bool condition) +{ + return false; +} +#endif + +/* These all work on the VA type */ +#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2) +#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2) +#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2) +#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2) +#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2) +#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2) +#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2) +#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2) +#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2) +#define vaffs(a) ffs_t(pt_vaddr_t, a) +#define vafls(a) fls_t(pt_vaddr_t, a) +#define vaffz(a) ffz_t(pt_vaddr_t, a) + +/* + * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and + * generate a useful defined result. The non-fva versions will malfunction at + * this extreme. 
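+ * For example, with PT_FEAT_FULL_VA compiled in, fvalog2_div(a, PT_VADDR_MAX_LG2) returns 0, whereas a plain shift by 64 bits is undefined.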
+ */ +static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return 0; + return log2_div_t(pt_vaddr_t, a, b_lg2); +} + +static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return a; + return log2_mod_t(pt_vaddr_t, a, b_lg2); +} + +static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b, + unsigned int c_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2) + return true; + return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val, + unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return val; + return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return PT_VADDR_MAX; + return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2); +} + +/* These all work on the OA type */ +#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2) +#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2) +#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2) +#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2) +#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2) +#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2) +#define oaffs(a) ffs_t(pt_oaddr_t, a) +#define oafls(a) fls_t(pt_oaddr_t, a) +#define oaffz(a) ffz_t(pt_oaddr_t, a) + +static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem, + unsigned int top_level) +{ + return top_level | (uintptr_t)table_mem; +} + +static inline void pt_top_set(struct pt_common *common, + struct pt_table_p *table_mem, + unsigned int top_level) +{ + WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level)); +} + +static inline void pt_top_set_level(struct pt_common *common, + unsigned int top_level) +{ + pt_top_set(common, NULL, top_level); +} + +static inline unsigned int pt_top_get_level(const struct pt_common *common) +{ + return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS); +} + +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2); + +#endif diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h new file mode 100644 index 000000000000..69fb7c2314ca --- /dev/null +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Default definitions for formats that don't define these functions. + */ +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H +#define __GENERIC_PT_PT_FMT_DEFAULTS_H + +#include "pt_defs.h" +#include <linux/log2.h> + +/* Header self-compile default defines */ +#ifndef pt_load_entry_raw +#include "fmt/amdv1.h" +#endif + +/* + * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and + * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top. 
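+ * For instance, a format with a 4K granule, 4K tables and 8 byte items decodes 9 VA bits per level, so item sizes follow 12 + 9 * level (4K, 2M, 1G, ...).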
+ */ +#ifndef pt_table_item_lg2sz +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts) +{ + return PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level; +} +#endif + +#ifndef pt_pgsz_lg2_to_level +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2) +{ + return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) / + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)); +} +#endif + +/* + * If not supplied by the format then contiguous pages are not supported. + * + * If contiguous pages are supported then the format must also provide + * pt_contig_count_lg2() if it supports a single contiguous size per level, + * or pt_possible_sizes() if it supports multiple sizes per level. + */ +#ifndef pt_entry_num_contig_lg2 +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} + +/* + * Return the number of contiguous OA items forming an entry at this table level + */ +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} +#endif + +/* If not supplied by the format then dirty tracking is not supported */ +#ifndef pt_entry_is_write_dirty +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts) +{ + return false; +} + +static inline void pt_entry_make_write_clean(struct pt_state *pts) +{ +} + +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return false; +} +#else +/* If not supplied then dirty tracking is always enabled */ +#ifndef pt_dirty_supported +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return true; +} +#endif +#endif + +#ifndef pt_entry_make_write_dirty +static inline bool pt_entry_make_write_dirty(struct pt_state *pts) +{ + return false; +} +#endif + +/* + * Format supplies either: + * pt_entry_oa - OA is at the start of a contiguous entry + * or + * pt_item_oa - OA is adjusted for every item in a contiguous entry + * + * Build the missing one + * + * The internal helper _pt_entry_oa_fast() allows generating + * an efficient pt_entry_oa_exact(), it doesn't care which + * option is selected. + */ +#ifdef pt_entry_oa +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts) +{ + return pt_entry_oa(pts) | + log2_mul(pts->index, pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_entry_oa +#endif + +#ifdef pt_item_oa +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts) +{ + return log2_set_mod(pt_item_oa(pts), 0, + pt_entry_num_contig_lg2(pts) + + pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_item_oa +#endif + +/* + * If not supplied by the format then use the constant + * PT_MAX_OUTPUT_ADDRESS_LG2. 
+ */ +#ifndef pt_max_oa_lg2 +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common) +{ + return PT_MAX_OUTPUT_ADDRESS_LG2; +} +#endif + +#ifndef pt_has_system_page_size +static inline bool pt_has_system_page_size(const struct pt_common *common) +{ + return PT_GRANULE_LG2SZ == PAGE_SHIFT; +} +#endif + +/* + * If not supplied by the format then assume only one contiguous size determined + * by pt_contig_count_lg2() + */ +#ifndef pt_possible_sizes +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts); + +/* Return a bitmap of possible leaf page sizes at this level */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!pt_can_have_leaf(pts)) + return 0; + return log2_to_int(isz_lg2) | + log2_to_int(pt_contig_count_lg2(pts) + isz_lg2); +} +#endif + +/* If not supplied by the format then use 0. */ +#ifndef pt_full_va_prefix +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common) +{ + return 0; +} +#endif + +/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */ +#ifndef pt_clear_entries +static inline void pt_clear_entries64(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries32(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u32 *tablep = pt_cur_table(pts, u32) + pts->index; + u32 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + if (PT_ITEM_WORD_SIZE == sizeof(u32)) + pt_clear_entries32(pts, num_contig_lg2); + else + pt_clear_entries64(pts, num_contig_lg2); +} +#define pt_clear_entries pt_clear_entries +#endif + +/* If not supplied then SW bits are not supported */ +#ifdef pt_sw_bit +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr) +{ + /* Acquire, pairs with pt_set_sw_bit_release() */ + smp_mb(); + /* For a contiguous entry the sw bit is only stored in the first item. 
*/ + return pts->entry & pt_sw_bit(bitnr); +} +#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire + +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr) +{ +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) + if (PT_ITEM_WORD_SIZE == sizeof(u64)) { + u64 *entryp = pt_cur_table(pts, u64) + pts->index; + u64 old_entry = pts->entry; + u64 new_entry; + + do { + new_entry = old_entry | pt_sw_bit(bitnr); + } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry)); + pts->entry = new_entry; + return; + } +#endif + if (PT_ITEM_WORD_SIZE == sizeof(u32)) { + u32 *entryp = pt_cur_table(pts, u32) + pts->index; + u32 old_entry = pts->entry; + u32 new_entry; + + do { + new_entry = old_entry | pt_sw_bit(bitnr); + } while (!try_cmpxchg_release(entryp, &old_entry, new_entry)); + pts->entry = new_entry; + } else + BUILD_BUG(); +} +#define pt_set_sw_bit_release pt_set_sw_bit_release +#else +static inline unsigned int pt_max_sw_bit(struct pt_common *common) +{ + return 0; +} + +extern void __pt_no_sw_bit(void); +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr) +{ + __pt_no_sw_bit(); + return false; +} + +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr) +{ + __pt_no_sw_bit(); +} +#endif + +/* + * Format can call in the pt_install_leaf_entry() to check the arguments are all + * aligned correctly. + */ +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2))) + return false; + +#ifdef pt_possible_sizes + if (PT_WARN_ON(isz_lg2 > oasz_lg2 || + oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts))) + return false; +#else + if (PT_WARN_ON(oasz_lg2 != isz_lg2 && + oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts))) + return false; +#endif + + if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2))) + return false; + return true; +} + +#endif diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h new file mode 100644 index 000000000000..c0d8617cce29 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -0,0 +1,636 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Iterators for Generic Page Table + */ +#ifndef __GENERIC_PT_PT_ITER_H +#define __GENERIC_PT_PT_ITER_H + +#include "pt_common.h" + +#include <linux/errno.h> + +/* + * Use to mangle symbols so that backtraces and the symbol table are + * understandable. Any non-inlined function should get mangled like this. + */ +#define NS(fn) CONCATENATE(PTPFX, fn) + +/** + * pt_check_range() - Validate the range can be iterated + * @range: Range to validate + * + * Check that VA and last_va fall within the permitted range of VAs. If the + * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension + * is correct. + */ +static inline int pt_check_range(struct pt_range *range) +{ + pt_vaddr_t prefix; + + PT_WARN_ON(!range->max_vasz_lg2); + + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) { + PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2); + prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ? 
+ PT_VADDR_MAX : + 0; + } else { + prefix = pt_full_va_prefix(range->common); + } + + if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) || + !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2)) + return -ERANGE; + return 0; +} + +/** + * pt_index_to_va() - Update range->va to the current pts->index + * @pts: Iteration State + * + * Adjust range->va to match the current index. This is done in a lazy manner + * since computing the VA takes several instructions and is rarely required. + */ +static inline void pt_index_to_va(struct pt_state *pts) +{ + pt_vaddr_t lower_va; + + lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts)); + pts->range->va = fvalog2_set_mod(pts->range->va, lower_va, + pt_table_oa_lg2sz(pts)); +} + +/* + * Add index_count_lg2 number of entries to pts's VA and index. The VA will be + * adjusted to the end of the contiguous block if it is currently in the middle. + */ +static inline void _pt_advance(struct pt_state *pts, + unsigned int index_count_lg2) +{ + pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0, + index_count_lg2); +} + +/** + * pt_entry_fully_covered() - Check if the item or entry is entirely contained + * within pts->range + * @pts: Iteration State + * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or + * pt_entry_oa_lg2sz() + * + * Returns: true if the item is fully enclosed by the pts->range. + */ +static inline bool pt_entry_fully_covered(const struct pt_state *pts, + unsigned int oasz_lg2) +{ + struct pt_range *range = pts->range; + + /* Range begins at the start of the entry */ + if (log2_mod(pts->range->va, oasz_lg2)) + return false; + + /* Range ends past the end of the entry */ + if (!log2_div_eq(range->va, range->last_va, oasz_lg2)) + return true; + + /* Range ends at the end of the entry */ + return log2_mod_eq_max(range->last_va, oasz_lg2); +} + +/** + * pt_range_to_index() - Starting index for an iteration + * @pts: Iteration State + * + * Return: the starting index for the iteration in pts. + */ +static inline unsigned int pt_range_to_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + PT_WARN_ON(pts->level > pts->range->top_level); + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->va, + pts->range->max_vasz_lg2), + isz_lg2); + return log2_mod(log2_div(pts->range->va, isz_lg2), + pt_num_items_lg2(pts)); +} + +/** + * pt_range_to_end_index() - Ending index iteration + * @pts: Iteration State + * + * Return: the last index for the iteration in pts. 
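+ * The value is an exclusive end; iteration stops once pts->index reaches it.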
+ */ +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct pt_range *range = pts->range; + unsigned int num_entries_lg2; + + if (range->va == range->last_va) + return pts->index + 1; + + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->last_va, + pts->range->max_vasz_lg2), + isz_lg2) + + 1; + + num_entries_lg2 = pt_num_items_lg2(pts); + + /* last_va falls within this table */ + if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2)) + return log2_mod(log2_div(pts->range->last_va, isz_lg2), + num_entries_lg2) + + 1; + + return log2_to_int(num_entries_lg2); +} + +static inline void _pt_iter_first(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pts->end_index = pt_range_to_end_index(pts); + PT_WARN_ON(pts->index > pts->end_index); +} + +static inline bool _pt_iter_load(struct pt_state *pts) +{ + if (pts->index >= pts->end_index) + return false; + pt_load_entry(pts); + return true; +} + +/** + * pt_next_entry() - Advance pts to the next entry + * @pts: Iteration State + * + * Update pts to go to the next index at this level. If pts is pointing at a + * contiguous entry then the index may advance my more than one. + */ +static inline void pt_next_entry(struct pt_state *pts) +{ + if (pts->type == PT_ENTRY_OA && + !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0)) + _pt_advance(pts, pt_entry_num_contig_lg2(pts)); + else + pts->index++; + pt_index_to_va(pts); +} + +/** + * for_each_pt_level_entry() - For loop wrapper over entries in the range + * @pts: Iteration State + * + * This is the basic iteration primitive. It iterates over all the entries in + * pts->range that fall within the pts's current table level. Each step does + * pt_load_entry(pts). + */ +#define for_each_pt_level_entry(pts) \ + for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts)) + +/** + * pt_load_single_entry() - Version of pt_load_entry() usable within a walker + * @pts: Iteration State + * + * Alternative to for_each_pt_level_entry() if the walker function uses only a + * single entry. + */ +static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pt_load_entry(pts); + return pts->type; +} + +static __always_inline struct pt_range _pt_top_range(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = { + .common = common, + .top_table = + (struct pt_table_p *)(top_of_table & + ~(uintptr_t)PT_TOP_LEVEL_MASK), + .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS), + }; + struct pt_state pts = { .range = &range, .level = range.top_level }; + unsigned int max_vasz_lg2; + + max_vasz_lg2 = common->max_vasz_lg2; + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + pts.level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2, + pt_num_items_lg2(&pts) + + pt_table_item_lg2sz(&pts)); + + /* + * The top range will default to the lower region only with sign extend. + */ + range.max_vasz_lg2 = max_vasz_lg2; + if (pt_feature(common, PT_FEAT_SIGN_EXTEND)) + max_vasz_lg2--; + + range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2); + range.last_va = + fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2); + return range; +} + +/** + * pt_top_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the + * total page table. 
Otherwise it returns the entire page table. + */ +static __always_inline struct pt_range pt_top_range(struct pt_common *common) +{ + /* + * The top pointer can change without locking. We capture the value and + * it's level here and are safe to walk it so long as both values are + * captured without tearing. + */ + return _pt_top_range(common, READ_ONCE(common->top_of_table)); +} + +/** + * pt_all_range() - Return a range that spans the entire page table + * @common: Table + * + * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND + * is supported range->va and range->last_va will be incorrect during the + * iteration and must not be accessed. + */ +static inline struct pt_range pt_all_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + /* + * Pretend the table is linear from 0 without a sign extension. This + * generates the correct indexes for iteration. + */ + range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2); + return range; +} + +/** + * pt_upper_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the + * total page table. Otherwise it returns the entire page table. + */ +static inline struct pt_range pt_upper_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1); + range.last_va = PT_VADDR_MAX; + return range; +} + +/** + * pt_make_range() - Return a range that spans part of the table + * @common: Table + * @va: Start address + * @last_va: Last address + * + * The caller must validate the range with pt_check_range() before using it. + */ +static __always_inline struct pt_range +pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va) +{ + struct pt_range range = + _pt_top_range(common, READ_ONCE(common->top_of_table)); + + range.va = va; + range.last_va = last_va; + + return range; +} + +/* + * Span a slice of the table starting at a lower table level from an active + * walk. + */ +static __always_inline struct pt_range +pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va, + pt_vaddr_t last_va) +{ + struct pt_range range = *parent; + + range.va = va; + range.last_va = last_va; + + PT_WARN_ON(last_va < va); + PT_WARN_ON(pt_check_range(&range)); + + return range; +} + +/** + * pt_init() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * @level: Table level for the state + * @table: Pointer to the table memory at level + * + * Helper to initialize the on-stack pt_state from walker arguments. + */ +static __always_inline struct pt_state +pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = { + .range = range, + .table = table, + .level = level, + }; + return pts; +} + +/** + * pt_init_top() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * + * The pt_state points to the top most level. 
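+ * The range is expected to already have top_table and top_level filled in, e.g. by pt_top_range() or pt_make_range().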
+ */ +static __always_inline struct pt_state pt_init_top(struct pt_range *range) +{ + return pt_init(range, range->top_level, range->top_table); +} + +typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table); + +/** + * pt_descend() - Recursively invoke the walker for the lower level + * @pts: Iteration State + * @arg: Value to pass to the function + * @fn: Walker function to call + * + * pts must point to a table item. Invoke fn as a walker on the table + * pts points to. + */ +static __always_inline int pt_descend(struct pt_state *pts, void *arg, + pt_level_fn_t fn) +{ + int ret; + + if (PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower); + return ret; +} + +/** + * pt_walk_range() - Walk over a VA range + * @range: Range pointer + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * Walk over a VA range. The caller should have done a validity check, at + * least calling pt_check_range(), when building range. The walk will + * start at the top most table. + */ +static __always_inline int pt_walk_range(struct pt_range *range, + pt_level_fn_t fn, void *arg) +{ + return fn(range, arg, range->top_level, range->top_table); +} + +/* + * pt_walk_descend() - Recursively invoke the walker for a slice of a lower + * level + * @pts: Iteration State + * @va: Start address + * @last_va: Last address + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and over a slice of the + * lower table. The caller must ensure that va/last_va are within the table + * item. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int pt_walk_descend(const struct pt_state *pts, + pt_vaddr_t va, pt_vaddr_t last_va, + pt_level_fn_t fn, void *arg) +{ + struct pt_range range = pt_make_child_range(pts->range, va, last_va); + + if (PT_WARN_ON(!pt_can_have_table(pts)) || + PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + return fn(&range, arg, pts->level - 1, pts->table_lower); +} + +/* + * pt_walk_descend_all() - Recursively invoke the walker for a table item + * @parent_pts: Iteration State + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and over the entire lower + * table. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn, + void *arg) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts); + + return pt_walk_descend(parent_pts, + log2_set_mod(parent_pts->range->va, 0, isz_lg2), + log2_set_mod_max(parent_pts->range->va, isz_lg2), + fn, arg); +} + +/** + * pt_range_slice() - Return a range that spans indexes + * @pts: Iteration State + * @start_index: Starting index within pts + * @end_index: Ending index within pts + * + * Create a range than spans an index range of the current table level + * pt_state points at. 
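+ * start_index is inclusive and end_index is exclusive.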
+ */ +static inline struct pt_range pt_range_slice(const struct pt_state *pts, + unsigned int start_index, + unsigned int end_index) +{ + unsigned int table_lg2sz = pt_table_oa_lg2sz(pts); + pt_vaddr_t last_va; + pt_vaddr_t va; + + va = fvalog2_set_mod(pts->range->va, + log2_mul(start_index, pt_table_item_lg2sz(pts)), + table_lg2sz); + last_va = fvalog2_set_mod( + pts->range->va, + log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz); + return pt_make_child_range(pts->range, va, last_va); +} + +/** + * pt_top_memsize_lg2() + * @common: Table + * @top_of_table: Top of table value from _pt_top_set() + * + * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this + * will compute the top size assuming the table will grow. + */ +static inline unsigned int pt_top_memsize_lg2(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = _pt_top_range(common, top_of_table); + struct pt_state pts = pt_init_top(&range); + unsigned int num_items_lg2; + + num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts); + if (range.top_level != PT_MAX_TOP_LEVEL && + pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts)); + + /* Round up the allocation size to the minimum alignment */ + return max(ffs_t(u64, PT_TOP_PHYS_MASK), + num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE)); +} + +/** + * pt_compute_best_pgsize() - Determine the best page size for leaf entries + * @pgsz_bitmap: Permitted page sizes + * @va: Starting virtual address for the leaf entry + * @last_va: Last virtual address for the leaf entry, sets the max page size + * @oa: Starting output address for the leaf entry + * + * Compute the largest page size for va, last_va, and oa together and return it + * in lg2. The largest page size depends on the format's supported page sizes at + * this level, and the relative alignment of the VA and OA addresses. 0 means + * the OA cannot be stored with the provided pgsz_bitmap. + */ +static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, + pt_vaddr_t va, + pt_vaddr_t last_va, + pt_oaddr_t oa) +{ + unsigned int best_pgsz_lg2; + unsigned int pgsz_lg2; + pt_vaddr_t len = last_va - va + 1; + pt_vaddr_t mask; + + if (PT_WARN_ON(va >= last_va)) + return 0; + + /* + * Given a VA/OA pair the best page size is the largest page size + * where: + * + * 1) VA and OA start at the page. Bitwise this is the count of least + * significant 0 bits. + * This also implies that last_va/oa has the same prefix as va/oa. + */ + mask = va | oa; + + /* + * 2) The page size is not larger than the last_va (length). Since page + * sizes are always power of two this can't be larger than the + * largest power of two factor of the length. 
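+ * (The format's supported sizes in pgsz_bitmap are applied further below, after the alignment and length limits are combined.)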
+ */ + mask |= log2_to_int(vafls(len) - 1); + + best_pgsz_lg2 = vaffs(mask); + + /* Choose the highest bit <= best_pgsz_lg2 */ + if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1) + pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1); + + pgsz_lg2 = vafls(pgsz_bitmap); + if (!pgsz_lg2) + return 0; + + pgsz_lg2--; + + PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0); + PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0); + PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va); + PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + PT_WARN_ON( + !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + return pgsz_lg2; +} + +#define _PT_MAKE_CALL_LEVEL(fn) \ + static __always_inline int fn(struct pt_range *range, void *arg, \ + unsigned int level, \ + struct pt_table_p *table) \ + { \ + static_assert(PT_MAX_TOP_LEVEL <= 5); \ + if (level == 0) \ + return CONCATENATE(fn, 0)(range, arg, 0, table); \ + if (level == 1 || PT_MAX_TOP_LEVEL == 1) \ + return CONCATENATE(fn, 1)(range, arg, 1, table); \ + if (level == 2 || PT_MAX_TOP_LEVEL == 2) \ + return CONCATENATE(fn, 2)(range, arg, 2, table); \ + if (level == 3 || PT_MAX_TOP_LEVEL == 3) \ + return CONCATENATE(fn, 3)(range, arg, 3, table); \ + if (level == 4 || PT_MAX_TOP_LEVEL == 4) \ + return CONCATENATE(fn, 4)(range, arg, 4, table); \ + return CONCATENATE(fn, 5)(range, arg, 5, table); \ + } + +static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg, + unsigned int unused_level, + struct pt_table_p *table) +{ + static_assert(PT_MAX_TOP_LEVEL <= 5); + return -EPROTOTYPE; +} + +#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \ + static inline int fn(struct pt_range *range, void *arg, \ + unsigned int unused_level, \ + struct pt_table_p *table) \ + { \ + return do_fn(range, arg, level, table, descend_fn); \ + } + +/** + * PT_MAKE_LEVELS() - Build an unwound walker + * @fn: Name of the walker function + * @do_fn: Function to call at each level + * + * This builds a function call tree that can be fully inlined. + * The caller must provide a function body in an __always_inline function:: + * + * static __always_inline int do_fn(struct pt_range *range, void *arg, + * unsigned int level, struct pt_table_p *table, + * pt_level_fn_t descend_fn) + * + * An inline function will be created for each table level that calls do_fn with + * a compile time constant for level and a pointer to the next lower function. + * This generates an optimally inlined walk where each of the functions sees a + * constant level and can codegen the exact constants/etc for that level. + * + * Note this can produce a lot of code! 
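+ *
+ * A minimal sketch (my_walker/__my_walker are hypothetical names)::
+ *
+ *	static __always_inline int __my_walker(struct pt_range *range,
+ *					       void *arg, unsigned int level,
+ *					       struct pt_table_p *table,
+ *					       pt_level_fn_t descend_fn)
+ *	{
+ *		return 0;
+ *	}
+ *	PT_MAKE_LEVELS(my_walker, __my_walker);
+ *
+ * Each generated level function sees its level as a compile time constant,
+ * and the dispatcher my_walker() matches pt_level_fn_t so it can be passed
+ * to pt_walk_range().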
+ */ +#define PT_MAKE_LEVELS(fn, do_fn) \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \ + do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \ + _PT_MAKE_CALL_LEVEL(fn) + +#endif diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h new file mode 100644 index 000000000000..6dbbed119238 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_log2.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Helper macros for working with log2 values + * + */ +#ifndef __GENERIC_PT_LOG2_H +#define __GENERIC_PT_LOG2_H +#include <linux/bitops.h> +#include <linux/limits.h> + +/* Compute a */ +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2))) +static_assert(log2_to_int_t(unsigned int, 0) == 1); + +/* Compute a - 1 (aka all low bits set) */ +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1)) + +/* Compute a / b */ +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2))) +static_assert(log2_div_t(unsigned int, 4, 2) == 1); + +/* + * Compute: + * a / c == b / c + * aka the high bits are equal + */ +#define log2_div_eq_t(type, a, b, c_lg2) \ + (log2_div_t(type, (a) ^ (b), c_lg2) == 0) +static_assert(log2_div_eq_t(unsigned int, 1, 1, 2)); + +/* Compute a % b */ +#define log2_mod_t(type, a, b_lg2) \ + ((type)(((type)a) & log2_to_max_int_t(type, b_lg2))) +static_assert(log2_mod_t(unsigned int, 1, 2) == 1); + +/* + * Compute: + * a % b == b - 1 + * aka the low bits are all 1s + */ +#define log2_mod_eq_max_t(type, a, b_lg2) \ + (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2)) +static_assert(log2_mod_eq_max_t(unsigned int, 3, 2)); + +/* + * Return a value such that: + * a / b == ret / b + * ret % b == val + * aka set the low bits to val. val must be < b + */ +#define log2_set_mod_t(type, a, val, b_lg2) \ + ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val))) +static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1); + +/* Return a value such that: + * a / b == ret / b + * ret % b == b - 1 + * aka set the low bits to all 1s + */ +#define log2_set_mod_max_t(type, a, b_lg2) \ + (((type)(a)) | log2_to_max_int_t(type, b_lg2)) +static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3); + +/* Compute a * b */ +#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2))) +static_assert(log2_mul_t(unsigned int, 2, 2) == 8); + +#define _dispatch_sz(type, fn, a) \ + (sizeof(type) == 4 ? 
fn##32((u32)a) : fn##64(a)) + +/* + * Return the highest value such that: + * fls_t(u32, 0) == 0 + * fls_t(u3, 1) == 1 + * a >= log2_to_int(ret - 1) + * aka find last set bit + */ +static inline unsigned int fls32(u32 a) +{ + return fls(a); +} +#define fls_t(type, a) _dispatch_sz(type, fls, a) + +/* + * Return the highest value such that: + * ffs_t(u32, 0) == UNDEFINED + * ffs_t(u32, 1) == 0 + * log_mod(a, ret) == 0 + * aka find first set bit + */ +static inline unsigned int __ffs32(u32 a) +{ + return __ffs(a); +} +#define ffs_t(type, a) _dispatch_sz(type, __ffs, a) + +/* + * Return the highest value such that: + * ffz_t(u32, U32_MAX) == UNDEFINED + * ffz_t(u32, 0) == 0 + * ffz_t(u32, 1) == 1 + * log_mod(a, ret) == log_to_max_int(ret) + * aka find first zero bit + */ +static inline unsigned int ffz32(u32 a) +{ + return ffz(a); +} +static inline unsigned int ffz64(u64 a) +{ + if (sizeof(u64) == sizeof(unsigned long)) + return ffz(a); + + if ((u32)a == U32_MAX) + return ffz32(a >> 32) + 32; + return ffz32(a); +} +#define ffz_t(type, a) _dispatch_sz(type, ffz, a) + +#endif diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index f2f538c70650..5471f814e073 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -13,6 +13,10 @@ config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && X86 select IOMMU_API + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_X86_64 + select IOMMU_PT_VTDSS select IOMMU_IOVA select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD @@ -66,7 +70,7 @@ config INTEL_IOMMU_DEFAULT_ON config INTEL_IOMMU_FLOPPY_WA def_bool y - depends on X86 + depends on X86 && BLK_DEV_FD help Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. This diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e236c7ec221f..4e888867e85c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -45,16 +45,9 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 -#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) -#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) - -/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR - to match. That way, we can use 'unsigned long' for PFNs with impunity. */ -#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ - __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) -#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) - static void __init check_tylersburg_isoch(void); +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); static int rwbf_quirk; #define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) @@ -217,7 +210,6 @@ static int disable_igfx_iommu; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; -static const struct iommu_dirty_ops intel_dirty_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { @@ -285,13 +277,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) -{ - int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; - - return !(addr_width < BITS_PER_LONG && pfn >> addr_width); -} - /* * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of @@ -353,23 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -/* Return the super pagesize bitmap if supported. */ -static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) -{ - unsigned long bitmap = 0; - - /* - * 1-level super page supports page size of 2MiB, 2-level super page - * supports page size of both 2MiB and 1GiB. - */ - if (domain->iommu_superpage == 1) - bitmap |= SZ_2M; - else if (domain->iommu_superpage == 2) - bitmap |= SZ_2M | SZ_1G; - - return bitmap; -} - struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { @@ -556,13 +524,6 @@ out: return iommu; } -static void domain_flush_cache(struct dmar_domain *domain, - void *addr, int size) -{ - if (!domain->iommu_coherency) - clflush_cache_range(addr, size); -} - static void free_context_table(struct intel_iommu *iommu) { struct context_entry *context; @@ -707,280 +668,6 @@ pgtable_walk: } #endif -static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, - unsigned long pfn, int *target_level, - gfp_t gfp) -{ - struct dma_pte *parent, *pte; - int level = agaw_to_level(domain->agaw); - int offset; - - if (!domain_pfn_supported(domain, pfn)) - /* Address beyond IOMMU's addressing capabilities. */ - return NULL; - - parent = domain->pgd; - - while (1) { - void *tmp_page; - - offset = pfn_level_offset(pfn, level); - pte = &parent[offset]; - if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) - break; - if (level == *target_level) - break; - - if (!dma_pte_present(pte)) { - uint64_t pteval, tmp; - - tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp, - SZ_4K); - - if (!tmp_page) - return NULL; - - domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); - pteval = virt_to_phys(tmp_page) | DMA_PTE_READ | - DMA_PTE_WRITE; - if (domain->use_first_level) - pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - - tmp = 0ULL; - if (!try_cmpxchg64(&pte->val, &tmp, pteval)) - /* Someone else set it while we were thinking; use theirs. 
*/ - iommu_free_pages(tmp_page); - else - domain_flush_cache(domain, pte, sizeof(*pte)); - } - if (level == 1) - break; - - parent = phys_to_virt(dma_pte_addr(pte)); - level--; - } - - if (!*target_level) - *target_level = level; - - return pte; -} - -/* return address's pte at specific level */ -static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, - unsigned long pfn, - int level, int *large_page) -{ - struct dma_pte *parent, *pte; - int total = agaw_to_level(domain->agaw); - int offset; - - parent = domain->pgd; - while (level <= total) { - offset = pfn_level_offset(pfn, total); - pte = &parent[offset]; - if (level == total) - return pte; - - if (!dma_pte_present(pte)) { - *large_page = total; - break; - } - - if (dma_pte_superpage(pte)) { - *large_page = total; - return pte; - } - - parent = phys_to_virt(dma_pte_addr(pte)); - total--; - } - return NULL; -} - -/* clear last level pte, a tlb flush should be followed */ -static void dma_pte_clear_range(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long last_pfn) -{ - unsigned int large_page; - struct dma_pte *first_pte, *pte; - - if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || - WARN_ON(start_pfn > last_pfn)) - return; - - /* we don't need lock here; nobody else touches the iova range */ - do { - large_page = 1; - first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); - if (!pte) { - start_pfn = align_to_level(start_pfn + 1, large_page + 1); - continue; - } - do { - dma_clear_pte(pte); - start_pfn += lvl_to_nr_pages(large_page); - pte++; - } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); - - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - - } while (start_pfn && start_pfn <= last_pfn); -} - -static void dma_pte_free_level(struct dmar_domain *domain, int level, - int retain_level, struct dma_pte *pte, - unsigned long pfn, unsigned long start_pfn, - unsigned long last_pfn) -{ - pfn = max(start_pfn, pfn); - pte = &pte[pfn_level_offset(pfn, level)]; - - do { - unsigned long level_pfn; - struct dma_pte *level_pte; - - if (!dma_pte_present(pte) || dma_pte_superpage(pte)) - goto next; - - level_pfn = pfn & level_mask(level); - level_pte = phys_to_virt(dma_pte_addr(pte)); - - if (level > 2) { - dma_pte_free_level(domain, level - 1, retain_level, - level_pte, level_pfn, start_pfn, - last_pfn); - } - - /* - * Free the page table if we're below the level we want to - * retain and the range covers the entire table. - */ - if (level < retain_level && !(start_pfn > level_pfn || - last_pfn < level_pfn + level_size(level) - 1)) { - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - iommu_free_pages(level_pte); - } -next: - pfn += level_size(level); - } while (!first_pte_in_page(++pte) && pfn <= last_pfn); -} - -/* - * clear last level (leaf) ptes and free page table pages below the - * level we wish to keep intact. 
- */ -static void dma_pte_free_pagetable(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long last_pfn, - int retain_level) -{ - dma_pte_clear_range(domain, start_pfn, last_pfn); - - /* We don't need lock here; nobody else touches the iova range */ - dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, - domain->pgd, 0, start_pfn, last_pfn); - - /* free pgd */ - if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - iommu_free_pages(domain->pgd); - domain->pgd = NULL; - } -} - -/* When a page at a given level is being unlinked from its parent, we don't - need to *modify* it at all. All we need to do is make a list of all the - pages which can be freed just as soon as we've flushed the IOTLB and we - know the hardware page-walk will no longer touch them. - The 'pte' argument is the *parent* PTE, pointing to the page that is to - be freed. */ -static void dma_pte_list_pagetables(struct dmar_domain *domain, - int level, struct dma_pte *parent_pte, - struct iommu_pages_list *freelist) -{ - struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte)); - - iommu_pages_list_add(freelist, pte); - - if (level == 1) - return; - - do { - if (dma_pte_present(pte) && !dma_pte_superpage(pte)) - dma_pte_list_pagetables(domain, level - 1, pte, freelist); - pte++; - } while (!first_pte_in_page(pte)); -} - -static void dma_pte_clear_level(struct dmar_domain *domain, int level, - struct dma_pte *pte, unsigned long pfn, - unsigned long start_pfn, unsigned long last_pfn, - struct iommu_pages_list *freelist) -{ - struct dma_pte *first_pte = NULL, *last_pte = NULL; - - pfn = max(start_pfn, pfn); - pte = &pte[pfn_level_offset(pfn, level)]; - - do { - unsigned long level_pfn = pfn & level_mask(level); - - if (!dma_pte_present(pte)) - goto next; - - /* If range covers entire pagetable, free it */ - if (start_pfn <= level_pfn && - last_pfn >= level_pfn + level_size(level) - 1) { - /* These suborbinate page tables are going away entirely. Don't - bother to clear them; we're just going to *free* them. */ - if (level > 1 && !dma_pte_superpage(pte)) - dma_pte_list_pagetables(domain, level - 1, pte, freelist); - - dma_clear_pte(pte); - if (!first_pte) - first_pte = pte; - last_pte = pte; - } else if (level > 1) { - /* Recurse down into a level that isn't *entirely* obsolete */ - dma_pte_clear_level(domain, level - 1, - phys_to_virt(dma_pte_addr(pte)), - level_pfn, start_pfn, last_pfn, - freelist); - } -next: - pfn = level_pfn + level_size(level); - } while (!first_pte_in_page(++pte) && pfn <= last_pfn); - - if (first_pte) - domain_flush_cache(domain, first_pte, - (void *)++last_pte - (void *)first_pte); -} - -/* We can't just free the pages because the IOMMU may still be walking - the page tables, and may have cached the intermediate levels. The - pages can only be freed after the IOTLB flush has been done. 
*/ -static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, - unsigned long last_pfn, - struct iommu_pages_list *freelist) -{ - if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || - WARN_ON(start_pfn > last_pfn)) - return; - - /* we don't need lock here; nobody else touches the iova range */ - dma_pte_clear_level(domain, agaw_to_level(domain->agaw), - domain->pgd, 0, start_pfn, last_pfn, freelist); - - /* free pgd */ - if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - iommu_pages_list_add(freelist, domain->pgd); - domain->pgd = NULL; - } -} - /* iommu handling */ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { @@ -1460,13 +1147,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, domain_lookup_dev_info(domain, iommu, bus, devfn); u16 did = domain_id_iommu(domain, iommu); int translation = CONTEXT_TT_MULTI_LEVEL; - struct dma_pte *pgd = domain->pgd; + struct pt_iommu_vtdss_hw_info pt_info; struct context_entry *context; int ret; if (WARN_ON(!intel_domain_is_ss_paging(domain))) return -EINVAL; + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); + pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1489,8 +1178,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, else translation = CONTEXT_TT_MULTI_LEVEL; - context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, domain->agaw); + context_set_address_root(context, pt_info.ssptptr); + context_set_address_width(context, pt_info.aw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); @@ -1537,177 +1226,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) return 0; } -/* Return largest possible superpage level for a given mapping */ -static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phy_pfn, unsigned long pages) -{ - int support, level = 1; - unsigned long pfnmerge; - - support = domain->iommu_superpage; - - /* To use a large page, the virtual *and* physical addresses - must be aligned to 2MiB/1GiB/etc. Lower bits set in either - of them will mean we have to use smaller pages. So just - merge them and check both at once. */ - pfnmerge = iov_pfn | phy_pfn; - - while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { - pages >>= VTD_STRIDE_SHIFT; - if (!pages) - break; - pfnmerge >>= VTD_STRIDE_SHIFT; - level++; - support--; - } - return level; -} - -/* - * Ensure that old small page tables are removed to make room for superpage(s). - * We're going to add new large pages, so make sure we don't remove their parent - * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
- */ -static void switch_to_super_page(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long end_pfn, int level) -{ - unsigned long lvl_pages = lvl_to_nr_pages(level); - struct dma_pte *pte = NULL; - - if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) || - !IS_ALIGNED(end_pfn + 1, lvl_pages))) - return; - - while (start_pfn <= end_pfn) { - if (!pte) - pte = pfn_to_dma_pte(domain, start_pfn, &level, - GFP_ATOMIC); - - if (dma_pte_present(pte)) { - dma_pte_free_pagetable(domain, start_pfn, - start_pfn + lvl_pages - 1, - level + 1); - - cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, - end_pfn << VTD_PAGE_SHIFT, 0); - } - - pte++; - start_pfn += lvl_pages; - if (first_pte_in_page(pte)) - pte = NULL; - } -} - -static int -__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, int prot, - gfp_t gfp) -{ - struct dma_pte *first_pte = NULL, *pte = NULL; - unsigned int largepage_lvl = 0; - unsigned long lvl_pages = 0; - phys_addr_t pteval; - u64 attr; - - if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) - return -EINVAL; - - if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) - return -EINVAL; - - if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { - pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); - return -EINVAL; - } - - attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); - if (domain->use_first_level) { - attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - if (prot & DMA_PTE_WRITE) - attr |= DMA_FL_PTE_DIRTY; - } - - domain->has_mappings = true; - - pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; - - while (nr_pages > 0) { - uint64_t tmp; - - if (!pte) { - largepage_lvl = hardware_largepage_caps(domain, iov_pfn, - phys_pfn, nr_pages); - - pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, - gfp); - if (!pte) - return -ENOMEM; - first_pte = pte; - - lvl_pages = lvl_to_nr_pages(largepage_lvl); - - /* It is large page*/ - if (largepage_lvl > 1) { - unsigned long end_pfn; - unsigned long pages_to_remove; - - pteval |= DMA_PTE_LARGE_PAGE; - pages_to_remove = min_t(unsigned long, - round_down(nr_pages, lvl_pages), - nr_pte_to_next_page(pte) * lvl_pages); - end_pfn = iov_pfn + pages_to_remove - 1; - switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); - } else { - pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; - } - - } - /* We don't need lock here, nobody else - * touches the iova range - */ - tmp = 0ULL; - if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { - static int dumps = 5; - pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", - iov_pfn, tmp, (unsigned long long)pteval); - if (dumps) { - dumps--; - debug_dma_dump_mappings(NULL); - } - WARN_ON(1); - } - - nr_pages -= lvl_pages; - iov_pfn += lvl_pages; - phys_pfn += lvl_pages; - pteval += lvl_pages * VTD_PAGE_SIZE; - - /* If the next PTE would be the first in a new page, then we - * need to flush the cache on the entries we've just written. - * And then we'll need to recalculate 'pte', so clear it and - * let it get set again in the if (!pte) block above. - * - * If we're done (!nr_pages) we need to flush the cache too. - * - * Also if we've been setting superpages, we may need to - * recalculate 'pte' and switch back to smaller pages for the - * end of the mapping, if the trailing size is not enough to - * use another superpage (i.e. nr_pages < lvl_pages). 
- */ - pte++; - if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && nr_pages < lvl_pages)) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - pte = NULL; - } - } - - return 0; -} - static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) { struct intel_iommu *iommu = info->iommu; @@ -1769,22 +1287,26 @@ static int domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct iommu_domain *old) { - struct dma_pte *pgd = domain->pgd; - int level, flags = 0; + struct pt_iommu_x86_64_hw_info pt_info; + unsigned int flags = 0; - level = agaw_to_level(domain->agaw); - if (level != 4 && level != 5) + pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info); + if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5)) return -EINVAL; - if (level == 5) + if (pt_info.levels == 5) flags |= PASID_FLAG_FL5LP; if (domain->force_snooping) flags |= PASID_FLAG_PAGE_SNOOP; + if (!(domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + flags |= PASID_FLAG_PWSNP; + return __domain_setup_first_level(iommu, dev, pasid, domain_id_iommu(domain, iommu), - __pa(pgd), flags, old); + pt_info.gcr3_pt, flags, old); } static int dmar_domain_attach_device(struct dmar_domain *domain, @@ -3230,7 +2752,8 @@ void device_block_translation(struct device *dev) } static int blocking_domain_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); @@ -3251,23 +2774,9 @@ static struct iommu_domain blocking_domain = { } }; -static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage) -{ - if (!intel_iommu_superpage) - return 0; - - if (first_stage) - return cap_fl1gp_support(iommu->cap) ? 2 : 1; - - return fls(cap_super_page_val(iommu->cap)); -} - -static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage) +static struct dmar_domain *paging_domain_alloc(void) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; struct dmar_domain *domain; - int addr_width; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) @@ -3282,56 +2791,38 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st INIT_LIST_HEAD(&domain->s1_domains); spin_lock_init(&domain->s1_lock); - domain->nid = dev_to_node(dev); - domain->use_first_level = first_stage; - - domain->domain.type = IOMMU_DOMAIN_UNMANAGED; - - /* calculate the address width */ - addr_width = agaw_to_width(iommu->agaw); - if (addr_width > cap_mgaw(iommu->cap)) - addr_width = cap_mgaw(iommu->cap); - domain->gaw = addr_width; - domain->agaw = iommu->agaw; - domain->max_addr = __DOMAIN_MAX_ADDR(addr_width); - - /* iommu memory access coherency */ - domain->iommu_coherency = iommu_paging_structure_coherency(iommu); + return domain; +} - /* pagesize bitmap */ - domain->domain.pgsize_bitmap = SZ_4K; - domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage); - domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); +static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int mgaw = cap_mgaw(iommu->cap); /* - * IOVA aperture: First-level translation restricts the input-address - * to a canonical address (i.e., address bits 63:N have the same value - * as address bit [N-1], where N is 48-bits with 4-level paging and - * 57-bits with 5-level paging). 
Hence, skip bit [N-1]. + * Spec 3.6 First-Stage Translation: + * + * Software must limit addresses to less than the minimum of MGAW + * and the lower canonical address width implied by FSPM (i.e., + * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level). */ - domain->domain.geometry.force_aperture = true; - domain->domain.geometry.aperture_start = 0; - if (first_stage) - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); - else - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); - - /* always allocate the top pgd */ - domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K); - if (!domain->pgd) { - kfree(domain); - return ERR_PTR(-ENOMEM); + if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) { + *top_level = 4; + return min(57, mgaw); } - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return domain; + /* Four level is always supported */ + *top_level = 3; + return min(48, mgaw); } static struct iommu_domain * intel_iommu_domain_alloc_first_stage(struct device *dev, struct intel_iommu *iommu, u32 flags) { + struct pt_iommu_x86_64_cfg cfg = {}; struct dmar_domain *dmar_domain; + int ret; if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); @@ -3340,10 +2831,20 @@ intel_iommu_domain_alloc_first_stage(struct device *dev, if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); - dmar_domain = paging_domain_alloc(dev, true); + dmar_domain = paging_domain_alloc(); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); + cfg.common.hw_max_vasz_lg2 = + compute_vasz_lg2_fs(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) | + BIT(PT_FEAT_FLUSH_RANGE); + /* First stage always uses scalable mode */ + if (!ecap_smpwc(iommu->ecap)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); dmar_domain->domain.ops = &intel_fs_paging_domain_ops; /* * iotlb sync for map is only needed for legacy implementations that @@ -3353,14 +2854,58 @@ intel_iommu_domain_alloc_first_stage(struct device *dev, if (rwbf_required(iommu)) dmar_domain->iotlb_sync_map = true; + ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); + } + + if (!cap_fl1gp_support(iommu->cap)) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; + return &dmar_domain->domain; } +static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int sagaw = cap_sagaw(iommu->cap); + unsigned int mgaw = cap_mgaw(iommu->cap); + + /* + * Find the largest table size that both the mgaw and sagaw support. + * This sets the valid range of IOVA and the top starting level. + * Some HW may only support a 4 or 5 level walk but must limit IOVA to + * 3 levels. 
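+ *
+ * Illustrative decode (example capability values): mgaw = 48 with
+ * sagaw = BIT(2) (4-level only) skips the first branch, takes the
+ * second, and yields *top_level = 3 with a usable IOVA width of
+ * min(48, 48) = 48 bits.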
+ */ + if (mgaw > 48 && sagaw >= BIT(3)) { + *top_level = 4; + return min(57, mgaw); + } else if (mgaw > 39 && sagaw >= BIT(2)) { + *top_level = 3 + ffs(sagaw >> 3); + return min(48, mgaw); + } else if (mgaw > 30 && sagaw >= BIT(1)) { + *top_level = 2 + ffs(sagaw >> 2); + return min(39, mgaw); + } + return 0; +} + +static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { + IOMMU_PT_DIRTY_OPS(vtdss), + .set_dirty_tracking = intel_iommu_set_dirty_tracking, +}; + static struct iommu_domain * intel_iommu_domain_alloc_second_stage(struct device *dev, struct intel_iommu *iommu, u32 flags) { + struct pt_iommu_vtdss_cfg cfg = {}; struct dmar_domain *dmar_domain; + unsigned int sslps; + int ret; if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | @@ -3377,15 +2922,46 @@ intel_iommu_domain_alloc_second_stage(struct device *dev, if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); - dmar_domain = paging_domain_alloc(dev, false); + dmar_domain = paging_domain_alloc(); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); + cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE); + + /* + * Read-only mapping is disallowed on the domain which serves as the + * parent in a nested configuration, due to HW errata + * (ERRATA_772415_SPR17) + */ + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) + cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE); + + if (!iommu_paging_structure_coherency(iommu)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); dmar_domain->domain.ops = &intel_ss_paging_domain_ops; dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) - dmar_domain->domain.dirty_ops = &intel_dirty_ops; + dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops; + + ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); + } + + /* Adjust the supported page sizes to HW capability */ + sslps = cap_super_page_val(iommu->cap); + if (!(sslps & BIT(0))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M; + if (!(sslps & BIT(1))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; /* * Besides the internal write buffer flush, the caching mode used for @@ -3427,14 +3003,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) if (WARN_ON(!list_empty(&dmar_domain->devices))) return; - if (dmar_domain->pgd) { - struct iommu_pages_list freelist = - IOMMU_PAGES_LIST_INIT(freelist); - - domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw), - &freelist); - iommu_put_pages_list(&freelist); - } + pt_iommu_deinit(&dmar_domain->iommu); kfree(dmar_domain->qi_batch); kfree(dmar_domain); @@ -3451,6 +3020,16 @@ static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return -EINVAL; + if (!ecap_smpwc(iommu->ecap) && + !(dmar_domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; + + /* Supports the number of table levels */ + if (!cap_fl5lp_support(iommu->cap) && + dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48) + return -EINVAL; + /* Same page size support */ if (!cap_fl1gp_support(iommu->cap) && (dmar_domain->domain.pgsize_bitmap & 
SZ_1G)) @@ -3467,7 +3046,11 @@ static int paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, struct intel_iommu *iommu) { + unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2; unsigned int sslps = cap_super_page_val(iommu->cap); + struct pt_iommu_vtdss_hw_info pt_info; + + pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info); if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) return -EINVAL; @@ -3478,6 +3061,19 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) return -EINVAL; + if (!iommu_paging_structure_coherency(iommu) && + !(dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; + + /* Address width falls within the capability */ + if (cap_mgaw(iommu->cap) < vasz_lg2) + return -EINVAL; + + /* Page table level is supported. */ + if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw))) + return -EINVAL; + /* Same page size support */ if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) return -EINVAL; @@ -3489,6 +3085,14 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, !dmar_domain->iotlb_sync_map) return -EINVAL; + /* + * FIXME this is locked wrong, it needs to be under the + * dmar_domain->lock + */ + if ((dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) && + !ecap_sc_support(iommu->ecap)) + return -EINVAL; return 0; } @@ -3498,7 +3102,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; int ret = -EINVAL; - int addr_width; if (intel_domain_is_fs_paging(dmar_domain)) ret = paging_domain_compatible_first_stage(dmar_domain, iommu); @@ -3509,26 +3112,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) if (ret) return ret; - /* - * FIXME this is locked wrong, it needs to be under the - * dmar_domain->lock - */ - if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) - return -EINVAL; - - if (dmar_domain->iommu_coherency != - iommu_paging_structure_coherency(iommu)) - return -EINVAL; - - - /* check if this iommu agaw is sufficient for max mapped address */ - addr_width = agaw_to_width(iommu->agaw); - if (addr_width > cap_mgaw(iommu->cap)) - addr_width = cap_mgaw(iommu->cap); - - if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) - return -EINVAL; - if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && context_copied(iommu, info->bus, info->devfn)) return intel_pasid_setup_sm_context(dev); @@ -3537,7 +3120,8 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) } static int intel_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret; @@ -3558,110 +3142,6 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return ret; } -static int intel_iommu_map(struct iommu_domain *domain, - unsigned long iova, phys_addr_t hpa, - size_t size, int iommu_prot, gfp_t gfp) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - u64 max_addr; - int prot = 0; - - if (iommu_prot & IOMMU_READ) - prot |= DMA_PTE_READ; - if (iommu_prot & IOMMU_WRITE) - prot |= DMA_PTE_WRITE; - if (dmar_domain->set_pte_snp) - prot |= DMA_PTE_SNP; - - max_addr = iova + size; - if (dmar_domain->max_addr < max_addr) { - u64 end; - - /* check if minimum agaw is sufficient for mapped 
address */ - end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; - if (end < max_addr) { - pr_err("%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, dmar_domain->gaw, max_addr); - return -EFAULT; - } - dmar_domain->max_addr = max_addr; - } - /* Round up size to next multiple of PAGE_SIZE, if it and - the low bits of hpa would take us onto the next page */ - size = aligned_nrpages(hpa, size); - return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, - hpa >> VTD_PAGE_SHIFT, size, prot, gfp); -} - -static int intel_iommu_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; - int ret; - - if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) - return -EINVAL; - - if (!IS_ALIGNED(iova | paddr, pgsize)) - return -EINVAL; - - ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); - if (!ret && mapped) - *mapped = size; - - return ret; -} - -static size_t intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *gather) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long start_pfn, last_pfn; - int level = 0; - - /* Cope with horrid API which requires us to unmap more than the - size argument if it happens to be a large-page mapping. */ - if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, - &level, GFP_ATOMIC))) - return 0; - - if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) - size = VTD_PAGE_SIZE << level_to_offset_bits(level); - - start_pfn = iova >> VTD_PAGE_SHIFT; - last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; - - domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); - - if (dmar_domain->max_addr == iova + size) - dmar_domain->max_addr = iova; - - /* - * We do not use page-selective IOTLB invalidation in flush queue, - * so there is no need to track page and sync iotlb. 
- */ - if (!iommu_iotlb_gather_queued(gather)) - iommu_iotlb_gather_add_page(domain, gather, iova, size); - - return size; -} - -static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; - - return intel_iommu_unmap(domain, iova, size, gather); -} - static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { @@ -3671,24 +3151,6 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain, iommu_put_pages_list(&gather->freelist); } -static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct dma_pte *pte; - int level = 0; - u64 phys = 0; - - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, - GFP_ATOMIC); - if (pte && dma_pte_present(pte)) - phys = dma_pte_addr(pte) + - (iova & (BIT_MASK(level_to_offset_bits(level) + - VTD_PAGE_SHIFT) - 1)); - - return phys; -} - static bool domain_support_force_snooping(struct dmar_domain *domain) { struct device_domain_info *info; @@ -3730,15 +3192,15 @@ static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain) struct dmar_domain *dmar_domain = to_dmar_domain(domain); guard(spinlock_irqsave)(&dmar_domain->lock); - if (!domain_support_force_snooping(dmar_domain) || - dmar_domain->has_mappings) + if (!domain_support_force_snooping(dmar_domain)) return false; /* * Second level page table supports per-PTE snoop control. The * iommu_map() interface will handle this by setting SNP bit. */ - dmar_domain->set_pte_snp = true; + dmar_domain->sspt.vtdss_pt.common.features |= + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE); dmar_domain->force_snooping = true; return true; } @@ -4302,49 +3764,6 @@ err_unwind: return ret; } -static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long end = iova + size - 1; - unsigned long pgsize; - - /* - * IOMMUFD core calls into a dirty tracking disabled domain without an - * IOVA bitmap set in order to clean dirty bits in all PTEs that might - * have occurred when we stopped dirty tracking. This ensures that we - * never inherit dirtied bits from a previous cycle. 
- */ - if (!dmar_domain->dirty_tracking && dirty->bitmap) - return -EINVAL; - - do { - struct dma_pte *pte; - int lvl = 0; - - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, - GFP_ATOMIC); - pgsize = level_size(lvl) << VTD_PAGE_SHIFT; - if (!pte || !dma_pte_present(pte)) { - iova += pgsize; - continue; - } - - if (dma_sl_pte_test_and_clear_dirty(pte, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -static const struct iommu_dirty_ops intel_dirty_ops = { - .set_dirty_tracking = intel_iommu_set_dirty_tracking, - .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, -}; - static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) { struct device_domain_info *info = dev_iommu_priv_get(dev); @@ -4401,7 +3820,9 @@ static int device_setup_pass_through(struct device *dev) context_setup_pass_through_cb, dev); } -static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) +static int identity_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; @@ -4462,27 +3883,23 @@ static struct iommu_domain identity_domain = { }; const struct iommu_domain_ops intel_fs_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, - .map_pages = intel_iommu_map_pages, - .unmap_pages = intel_iommu_unmap_pages, .iotlb_sync_map = intel_iommu_iotlb_sync_map, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, - .iova_to_phys = intel_iommu_iova_to_phys, .free = intel_iommu_domain_free, .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs, }; const struct iommu_domain_ops intel_ss_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(vtdss), .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, - .map_pages = intel_iommu_map_pages, - .unmap_pages = intel_iommu_unmap_pages, .iotlb_sync_map = intel_iommu_iotlb_sync_map, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, - .iova_to_phys = intel_iommu_iova_to_phys, .free = intel_iommu_domain_free, .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss, }; @@ -4797,3 +4214,5 @@ err: return ret; } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 3056583d7f56..25c5e22096d4 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -23,8 +23,8 @@ #include <linux/xarray.h> #include <linux/perf_event.h> #include <linux/pci.h> +#include <linux/generic_pt/iommu.h> -#include <asm/cacheflush.h> #include <asm/iommu.h> #include <uapi/linux/iommufd.h> @@ -595,22 +595,20 @@ struct qi_batch { }; struct dmar_domain { - int nid; /* node id */ + union { + struct iommu_domain domain; + struct pt_iommu iommu; + /* First stage page table */ + struct pt_iommu_x86_64 fspt; + /* Second stage page table */ + struct pt_iommu_vtdss sspt; + }; + struct xarray iommu_array; /* Attached IOMMU array */ - u8 iommu_coherency: 1; /* indicate coherency of iommu access */ - u8 force_snooping : 1; /* Create IOPTEs with snoop control */ - u8 set_pte_snp:1; - u8 use_first_level:1; /* DMA translation for the domain goes - * through the first level page table, - * otherwise, goes through the second - * level. 
- */ + u8 force_snooping:1; /* Create PASID entry with snoop control */ u8 dirty_tracking:1; /* Dirty tracking is enabled */ u8 nested_parent:1; /* Has other domains nested on it */ - u8 has_mappings:1; /* Has mappings configured through - * iommu_map() interface. - */ u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write * buffer when creating mappings. */ @@ -623,26 +621,9 @@ struct dmar_domain { struct list_head cache_tags; /* Cache tag list */ struct qi_batch *qi_batch; /* Batched QI descriptors */ - int iommu_superpage;/* Level of superpages supported: - 0 == 4KiB (no superpages), 1 == 2MiB, - 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ union { /* DMA remapping domain */ struct { - /* virtual address */ - struct dma_pte *pgd; - /* max guest address width */ - int gaw; - /* - * adjusted guest address width: - * 0: level 2 30-bit - * 1: level 3 39-bit - * 2: level 4 48-bit - * 3: level 5 57-bit - */ - int agaw; - /* maximum mapped address */ - u64 max_addr; /* Protect the s1_domains list */ spinlock_t s1_lock; /* Track s1_domains nested on this domain */ @@ -664,10 +645,10 @@ struct dmar_domain { struct mmu_notifier notifier; }; }; - - struct iommu_domain domain; /* generic domain data structure for - iommu core */ }; +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain); /* * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters. @@ -866,11 +847,6 @@ struct dma_pte { u64 val; }; -static inline void dma_clear_pte(struct dma_pte *pte) -{ - pte->val = 0; -} - static inline u64 dma_pte_addr(struct dma_pte *pte) { #ifdef CONFIG_64BIT @@ -886,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } -static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte, - unsigned long flags) -{ - if (flags & IOMMU_DIRTY_NO_CLEAR) - return (pte->val & DMA_SL_PTE_DIRTY) != 0; - - return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT, - (unsigned long *)&pte->val); -} - static inline bool dma_pte_superpage(struct dma_pte *pte) { return (pte->val & DMA_PTE_LARGE_PAGE); } -static inline bool first_pte_in_page(struct dma_pte *pte) -{ - return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); -} - -static inline int nr_pte_to_next_page(struct dma_pte *pte) -{ - return first_pte_in_page(pte) ? 
BIT_ULL(VTD_STRIDE_SHIFT) : - (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; -} - static inline bool context_present(struct context_entry *context) { return (context->lo & 1); @@ -927,11 +882,6 @@ static inline int agaw_to_level(int agaw) return agaw + 2; } -static inline int agaw_to_width(int agaw) -{ - return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); -} - static inline int width_to_agaw(int width) { return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); @@ -947,25 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level) return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; } -static inline u64 level_mask(int level) -{ - return -1ULL << level_to_offset_bits(level); -} - -static inline u64 level_size(int level) -{ - return 1ULL << level_to_offset_bits(level); -} - -static inline u64 align_to_level(u64 pfn, int level) -{ - return (pfn + level_size(level) - 1) & level_mask(level); -} - -static inline unsigned long lvl_to_nr_pages(unsigned int lvl) -{ - return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); -} static inline void context_set_present(struct context_entry *context) { @@ -1097,7 +1028,7 @@ static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, struct qi_desc *desc) { u8 dw = 0, dr = 0; - int ih = 0; + int ih = addr & 1; if (cap_write_drain(iommu->cap)) dw = 1; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 1b6ad9c900a5..a3fb8c193ca6 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -19,7 +19,7 @@ #include "pasid.h" static int intel_nested_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -29,11 +29,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, device_block_translation(dev); - if (iommu->agaw < dmar_domain->s2_domain->agaw) { - dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); - return -ENODEV; - } - /* * Stage-1 domain cannot work alone, it is nested on a s2_domain. 
* The s2_domain will be used in nested translation, hence needs diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 52f678975da7..3e2255057079 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -366,7 +366,7 @@ static void pasid_pte_config_first_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP); /* Setup Present and PASID Granular Transfer Type: */ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); @@ -461,19 +461,22 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu, */ static void pasid_pte_config_second_level(struct intel_iommu *iommu, struct pasid_entry *pte, - u64 pgd_val, int agaw, u16 did, - bool dirty_tracking) + struct dmar_domain *domain, u16 did) { + struct pt_iommu_vtdss_hw_info pt_info; + lockdep_assert_held(&iommu->lock); + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); pasid_clear_entry(pte); pasid_set_domain_id(pte, did); - pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, agaw); + pasid_set_slptr(pte, pt_info.ssptptr); + pasid_set_address_width(pte, pt_info.aw); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (dirty_tracking) + pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); + if (domain->dirty_tracking) pasid_set_ssade(pte); pasid_set_present(pte); @@ -484,10 +487,9 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct device *dev, u32 pasid) { struct pasid_entry *pte; - struct dma_pte *pgd; - u64 pgd_val; u16 did; + /* * If hardware advertises no support for second level * translation, return directly. 
@@ -498,8 +500,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EINVAL; } - pgd = domain->pgd; - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); spin_lock(&iommu->lock); @@ -514,8 +514,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw, - did, domain->dirty_tracking); + pasid_pte_config_second_level(iommu, pte, domain, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -529,8 +528,6 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu, u32 pasid) { struct pasid_entry *pte, new_pte; - struct dma_pte *pgd; - u64 pgd_val; u16 did; /* @@ -543,13 +540,9 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu, return -EINVAL; } - pgd = domain->pgd; - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); - pasid_pte_config_second_level(iommu, &new_pte, pgd_val, - domain->agaw, did, - domain->dirty_tracking); + pasid_pte_config_second_level(iommu, &new_pte, domain, did); spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); @@ -747,10 +740,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu, struct dmar_domain *s2_domain, u16 did) { - struct dma_pte *pgd = s2_domain->pgd; + struct pt_iommu_vtdss_hw_info pt_info; lockdep_assert_held(&iommu->lock); + pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info); + pasid_clear_entry(pte); if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) @@ -770,11 +765,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu, if (s2_domain->force_snooping) pasid_set_pgsnp(pte); - pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_slptr(pte, pt_info.ssptptr); pasid_set_fault_enable(pte); pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, s2_domain->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_address_width(pte, pt_info.aw); + pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); if (s2_domain->dirty_tracking) pasid_set_ssade(pte); pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index a771a77d4239..b4c85242dc79 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -24,6 +24,7 @@ #define PASID_FLAG_NESTED BIT(1) #define PASID_FLAG_PAGE_SNOOP BIT(2) +#define PASID_FLAG_PWSNP BIT(2) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e147f71f91b7..71de7947971f 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -170,6 +170,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; + sflags |= PASID_FLAG_PWSNP; ret = __domain_setup_first_level(iommu, dev, pasid, FLPT_DEFAULT_DID, __pa(mm->pgd), sflags, old); diff --git a/drivers/iommu/io-pgtable-arm-selftests.c b/drivers/iommu/io-pgtable-arm-selftests.c new file mode 100644 index 000000000000..334e70350924 --- /dev/null +++ b/drivers/iommu/io-pgtable-arm-selftests.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CPU-agnostic ARM page table allocator. 
+ * + * Copyright (C) 2014 ARM Limited + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt + +#include <kunit/device.h> +#include <kunit/test.h> +#include <linux/io-pgtable.h> +#include <linux/kernel.h> + +#include "io-pgtable-arm.h" + +static struct io_pgtable_cfg *cfg_cookie; + +static void dummy_tlb_flush_all(void *cookie) +{ + WARN_ON(cookie != cfg_cookie); +} + +static void dummy_tlb_flush(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + WARN_ON(cookie != cfg_cookie); + WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); +} + +static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ + dummy_tlb_flush(iova, granule, granule, cookie); +} + +static const struct iommu_flush_ops dummy_tlb_ops = { + .tlb_flush_all = dummy_tlb_flush_all, + .tlb_flush_walk = dummy_tlb_flush, + .tlb_add_page = dummy_tlb_add_page, +}; + +#define __FAIL(test, i) ({ \ + KUNIT_FAIL(test, "test failed for fmt idx %d\n", (i)); \ + -EFAULT; \ +}) + +static int arm_lpae_run_tests(struct kunit *test, struct io_pgtable_cfg *cfg) +{ + static const enum io_pgtable_fmt fmts[] = { + ARM_64_LPAE_S1, + ARM_64_LPAE_S2, + }; + + int i, j; + unsigned long iova; + size_t size, mapped; + struct io_pgtable_ops *ops; + + for (i = 0; i < ARRAY_SIZE(fmts); ++i) { + cfg_cookie = cfg; + ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg); + if (!ops) { + kunit_err(test, "failed to allocate io pgtable ops\n"); + return -ENOMEM; + } + + /* + * Initial sanity checks. + * Empty page tables shouldn't provide any translations. + */ + if (ops->iova_to_phys(ops, 42)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, SZ_1G + 42)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, SZ_2G + 42)) + return __FAIL(test, i); + + /* + * Distinct mappings of different granule sizes. + */ + iova = 0; + for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { + size = 1UL << j; + + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + /* Overlapping mappings */ + if (!ops->map_pages(ops, iova, iova + size, size, 1, + IOMMU_READ | IOMMU_NOEXEC, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) + return __FAIL(test, i); + + iova += SZ_1G; + } + + /* Full unmap */ + iova = 0; + for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { + size = 1UL << j; + + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42)) + return __FAIL(test, i); + + /* Remap full block */ + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_WRITE, GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) + return __FAIL(test, i); + + iova += SZ_1G; + } + + /* + * Map/unmap the last largest supported page of the IAS, this can + * trigger corner cases in the concatednated page tables. 
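+ *
+ * For example, with pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G and a 40-bit
+ * IAS this maps and then unmaps the single 1GiB block ending exactly
+ * at 1 << 40.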
+ */ + mapped = 0; + size = 1UL << __fls(cfg->pgsize_bitmap); + iova = (1UL << cfg->ias) - size; + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + if (mapped != size) + return __FAIL(test, i); + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) + return __FAIL(test, i); + + free_io_pgtable_ops(ops); + } + + return 0; +} + +static void arm_lpae_do_selftests(struct kunit *test) +{ + static const unsigned long pgsize[] = { + SZ_4K | SZ_2M | SZ_1G, + SZ_16K | SZ_32M, + SZ_64K | SZ_512M, + }; + + static const unsigned int address_size[] = { + 32, 36, 40, 42, 44, 48, + }; + + int i, j, k, pass = 0, fail = 0; + struct device *dev; + struct io_pgtable_cfg cfg = { + .tlb = &dummy_tlb_ops, + .coherent_walk = true, + .quirks = IO_PGTABLE_QUIRK_NO_WARN, + }; + + dev = kunit_device_register(test, "io-pgtable-test"); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); + if (IS_ERR_OR_NULL(dev)) + return; + + cfg.iommu_dev = dev; + + for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { + for (j = 0; j < ARRAY_SIZE(address_size); ++j) { + /* Don't use ias > oas as it is not valid for stage-2. */ + for (k = 0; k <= j; ++k) { + cfg.pgsize_bitmap = pgsize[i]; + cfg.ias = address_size[k]; + cfg.oas = address_size[j]; + kunit_info(test, "pgsize_bitmap 0x%08lx, IAS %u OAS %u\n", + pgsize[i], cfg.ias, cfg.oas); + if (arm_lpae_run_tests(test, &cfg)) + fail++; + else + pass++; + } + } + } + + kunit_info(test, "completed with %d PASS %d FAIL\n", pass, fail); +} + +static struct kunit_case io_pgtable_arm_test_cases[] = { + KUNIT_CASE(arm_lpae_do_selftests), + {}, +}; + +static struct kunit_suite io_pgtable_arm_test = { + .name = "io-pgtable-arm-test", + .test_cases = io_pgtable_arm_test_cases, +}; + +kunit_test_suite(io_pgtable_arm_test); + +MODULE_DESCRIPTION("io-pgtable-arm library kunit tests"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 7e8e2216c294..e6626004b323 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -12,8 +12,6 @@ #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/io-pgtable.h> -#include <linux/kernel.h> -#include <linux/device/faux.h> #include <linux/sizes.h> #include <linux/slab.h> #include <linux/types.h> @@ -1267,204 +1265,3 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = { .alloc = arm_mali_lpae_alloc_pgtable, .free = arm_lpae_free_pgtable, }; - -#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST - -static struct io_pgtable_cfg *cfg_cookie __initdata; - -static void __init dummy_tlb_flush_all(void *cookie) -{ - WARN_ON(cookie != cfg_cookie); -} - -static void __init dummy_tlb_flush(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - WARN_ON(cookie != cfg_cookie); - WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); -} - -static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather, - unsigned long iova, size_t granule, - void *cookie) -{ - dummy_tlb_flush(iova, granule, granule, cookie); -} - -static const struct iommu_flush_ops dummy_tlb_ops __initconst = { - .tlb_flush_all = dummy_tlb_flush_all, - .tlb_flush_walk = dummy_tlb_flush, - .tlb_add_page = dummy_tlb_add_page, -}; - -static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops) -{ - struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &data->iop.cfg; - - pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n", - cfg->pgsize_bitmap, cfg->ias); - 
pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n", - ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data), - ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd); -} - -#define __FAIL(ops, i) ({ \ - WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \ - arm_lpae_dump_ops(ops); \ - -EFAULT; \ -}) - -static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) -{ - static const enum io_pgtable_fmt fmts[] __initconst = { - ARM_64_LPAE_S1, - ARM_64_LPAE_S2, - }; - - int i, j; - unsigned long iova; - size_t size, mapped; - struct io_pgtable_ops *ops; - - for (i = 0; i < ARRAY_SIZE(fmts); ++i) { - cfg_cookie = cfg; - ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg); - if (!ops) { - pr_err("selftest: failed to allocate io pgtable ops\n"); - return -ENOMEM; - } - - /* - * Initial sanity checks. - * Empty page tables shouldn't provide any translations. - */ - if (ops->iova_to_phys(ops, 42)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_1G + 42)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_2G + 42)) - return __FAIL(ops, i); - - /* - * Distinct mappings of different granule sizes. - */ - iova = 0; - for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { - size = 1UL << j; - - if (ops->map_pages(ops, iova, iova, size, 1, - IOMMU_READ | IOMMU_WRITE | - IOMMU_NOEXEC | IOMMU_CACHE, - GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - /* Overlapping mappings */ - if (!ops->map_pages(ops, iova, iova + size, size, 1, - IOMMU_READ | IOMMU_NOEXEC, - GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) - return __FAIL(ops, i); - - iova += SZ_1G; - } - - /* Full unmap */ - iova = 0; - for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { - size = 1UL << j; - - if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42)) - return __FAIL(ops, i); - - /* Remap full block */ - if (ops->map_pages(ops, iova, iova, size, 1, - IOMMU_WRITE, GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) - return __FAIL(ops, i); - - iova += SZ_1G; - } - - /* - * Map/unmap the last largest supported page of the IAS, this can - * trigger corner cases in the concatednated page tables. - */ - mapped = 0; - size = 1UL << __fls(cfg->pgsize_bitmap); - iova = (1UL << cfg->ias) - size; - if (ops->map_pages(ops, iova, iova, size, 1, - IOMMU_READ | IOMMU_WRITE | - IOMMU_NOEXEC | IOMMU_CACHE, - GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - if (mapped != size) - return __FAIL(ops, i); - if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) - return __FAIL(ops, i); - - free_io_pgtable_ops(ops); - } - - return 0; -} - -static int __init arm_lpae_do_selftests(void) -{ - static const unsigned long pgsize[] __initconst = { - SZ_4K | SZ_2M | SZ_1G, - SZ_16K | SZ_32M, - SZ_64K | SZ_512M, - }; - - static const unsigned int address_size[] __initconst = { - 32, 36, 40, 42, 44, 48, - }; - - int i, j, k, pass = 0, fail = 0; - struct faux_device *dev; - struct io_pgtable_cfg cfg = { - .tlb = &dummy_tlb_ops, - .coherent_walk = true, - .quirks = IO_PGTABLE_QUIRK_NO_WARN, - }; - - dev = faux_device_create("io-pgtable-test", NULL, 0); - if (!dev) - return -ENOMEM; - - cfg.iommu_dev = &dev->dev; - - for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { - for (j = 0; j < ARRAY_SIZE(address_size); ++j) { - /* Don't use ias > oas as it is not valid for stage-2. 
*/ - for (k = 0; k <= j; ++k) { - cfg.pgsize_bitmap = pgsize[i]; - cfg.ias = address_size[k]; - cfg.oas = address_size[j]; - pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u OAS %u\n", - pgsize[i], cfg.ias, cfg.oas); - if (arm_lpae_run_tests(&cfg)) - fail++; - else - pass++; - } - } - } - - pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail); - faux_device_destroy(dev); - - return fail ? -EFAULT : 0; -} -subsys_initcall(arm_lpae_do_selftests); -#endif diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c index 8841c1487f00..843fec8e8a51 100644 --- a/drivers/iommu/io-pgtable.c +++ b/drivers/iommu/io-pgtable.c @@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = { #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S [ARM_V7S] = &io_pgtable_arm_v7s_init_fns, #endif -#ifdef CONFIG_AMD_IOMMU - [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns, - [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns, -#endif }; static int check_custom_allocator(enum io_pgtable_fmt fmt, diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c index 238c09e5166b..3bab175d8557 100644 --- a/drivers/iommu/iommu-pages.c +++ b/drivers/iommu/iommu-pages.c @@ -4,6 +4,7 @@ * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include "iommu-pages.h" +#include <linux/dma-mapping.h> #include <linux/gfp.h> #include <linux/mm.h> @@ -22,6 +23,11 @@ IOPTDESC_MATCH(memcg_data, memcg_data); #undef IOPTDESC_MATCH static_assert(sizeof(struct ioptdesc) <= sizeof(struct page)); +static inline size_t ioptdesc_mem_size(struct ioptdesc *desc) +{ + return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT); +} + /** * iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from * specific NUMA node @@ -36,6 +42,7 @@ static_assert(sizeof(struct ioptdesc) <= sizeof(struct page)); */ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size) { + struct ioptdesc *iopt; unsigned long pgcnt; struct folio *folio; unsigned int order; @@ -60,6 +67,9 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size) if (unlikely(!folio)) return NULL; + iopt = folio_ioptdesc(folio); + iopt->incoherent = false; + /* * All page allocations that should be reported to as "iommu-pagetables" * to userspace must use one of the functions below. This includes @@ -80,7 +90,10 @@ EXPORT_SYMBOL_GPL(iommu_alloc_pages_node_sz); static void __iommu_free_desc(struct ioptdesc *iopt) { struct folio *folio = ioptdesc_folio(iopt); - const unsigned long pgcnt = 1UL << folio_order(folio); + const unsigned long pgcnt = folio_nr_pages(folio); + + if (IOMMU_PAGES_USE_DMA_API) + WARN_ON_ONCE(iopt->incoherent); mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt); lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt); @@ -117,3 +130,124 @@ void iommu_put_pages_list(struct iommu_pages_list *list) __iommu_free_desc(iopt); } EXPORT_SYMBOL_GPL(iommu_put_pages_list); + +/** + * iommu_pages_start_incoherent - Setup the page for cache incoherent operation + * @virt: The page to setup + * @dma_dev: The iommu device + * + * For incoherent memory this will use the DMA API to manage the cache flushing + * on some arches. This is a lot of complexity compared to just calling + * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers + * have been doing. The DMA API requires keeping track of the DMA map and + * freeing it when required. This keeps track of the dma map inside the ioptdesc + * so that error paths are simple for the caller. 
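+ *
+ * Return: 0 on success or a negative errno on failure.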
+ */ +int iommu_pages_start_incoherent(void *virt, struct device *dma_dev) +{ + struct ioptdesc *iopt = virt_to_ioptdesc(virt); + dma_addr_t dma; + + if (WARN_ON(iopt->incoherent)) + return -EINVAL; + + if (!IOMMU_PAGES_USE_DMA_API) { + iommu_pages_flush_incoherent(dma_dev, virt, 0, + ioptdesc_mem_size(iopt)); + } else { + dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt), + DMA_TO_DEVICE); + if (dma_mapping_error(dma_dev, dma)) + return -EINVAL; + + /* + * The DMA API is not allowed to do anything other than DMA + * direct. It would be nice to also check + * dev_is_dma_coherent(dma_dev)); + */ + if (WARN_ON(dma != virt_to_phys(virt))) { + dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt), + DMA_TO_DEVICE); + return -EOPNOTSUPP; + } + } + + iopt->incoherent = 1; + return 0; +} +EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent); + +/** + * iommu_pages_start_incoherent_list - Make a list of pages incoherent + * @list: The list of pages to setup + * @dma_dev: The iommu device + * + * Perform iommu_pages_start_incoherent() across all of list. + * + * If this fails the caller must call iommu_pages_stop_incoherent_list(). + */ +int iommu_pages_start_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) +{ + struct ioptdesc *cur; + int ret; + + list_for_each_entry(cur, &list->pages, iopt_freelist_elm) { + if (WARN_ON(cur->incoherent)) + continue; + + ret = iommu_pages_start_incoherent( + folio_address(ioptdesc_folio(cur)), dma_dev); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list); + +/** + * iommu_pages_stop_incoherent_list - Undo incoherence across a list + * @list: The list of pages to release + * @dma_dev: The iommu device + * + * Revert iommu_pages_start_incoherent() across all of the list. Pages that did + * not call or succeed iommu_pages_start_incoherent() will be ignored. + */ +#if IOMMU_PAGES_USE_DMA_API +void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) +{ + struct ioptdesc *cur; + + list_for_each_entry(cur, &list->pages, iopt_freelist_elm) { + struct folio *folio = ioptdesc_folio(cur); + + if (!cur->incoherent) + continue; + dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)), + ioptdesc_mem_size(cur), DMA_TO_DEVICE); + cur->incoherent = 0; + } +} +EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list); + +/** + * iommu_pages_free_incoherent - Free an incoherent page + * @virt: virtual address of the page to be freed. + * @dma_dev: The iommu device + * + * If the page is incoherent it is made coherent again and then freed.
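+ * Pages that were never made incoherent are freed directly.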
+ */ +void iommu_pages_free_incoherent(void *virt, struct device *dma_dev) +{ + struct ioptdesc *iopt = virt_to_ioptdesc(virt); + + if (iopt->incoherent) { + dma_unmap_single(dma_dev, virt_to_phys(virt), + ioptdesc_mem_size(iopt), DMA_TO_DEVICE); + iopt->incoherent = 0; + } + __iommu_free_desc(iopt); +} +EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent); +#endif diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index b3af2813ed0c..ae9da4f571f6 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -21,7 +21,10 @@ struct ioptdesc { struct list_head iopt_freelist_elm; unsigned long __page_mapping; - pgoff_t __index; + union { + u8 incoherent; + pgoff_t __index; + }; void *_private; unsigned int __page_type; @@ -98,4 +101,48 @@ static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size) return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size); } -#endif /* __IOMMU_PAGES_H */ +int iommu_pages_start_incoherent(void *virt, struct device *dma_dev); +int iommu_pages_start_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev); + +#ifdef CONFIG_X86 +#define IOMMU_PAGES_USE_DMA_API 0 +#include <linux/cacheflush.h> + +static inline void iommu_pages_flush_incoherent(struct device *dma_dev, + void *virt, size_t offset, + size_t len) +{ + clflush_cache_range(virt + offset, len); +} +static inline void +iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) +{ + /* + * For performance leave the incoherent flag alone which turns this into + * a NOP. For X86 the rest of the stop/free flow ignores the flag. + */ +} +static inline void iommu_pages_free_incoherent(void *virt, + struct device *dma_dev) +{ + iommu_free_pages(virt); +} +#else +#define IOMMU_PAGES_USE_DMA_API 1 +#include <linux/dma-mapping.h> + +static inline void iommu_pages_flush_incoherent(struct device *dma_dev, + void *virt, size_t offset, + size_t len) +{ + dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len, + DMA_TO_DEVICE); +} +void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev); +void iommu_pages_free_incoherent(void *virt, struct device *dma_dev); +#endif + +#endif /* __IOMMU_PAGES_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 59244c744eab..2ca990dfbb88 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -100,7 +100,7 @@ static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev); + struct device *dev, struct iommu_domain *old); static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, @@ -114,6 +114,7 @@ enum { static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags); static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, @@ -542,8 +543,21 @@ static void iommu_deinit_device(struct device *dev) * Regardless, if a delayed attach never occurred, then the release * should still avoid touching any hardware configuration either. 
*/ - if (!dev->iommu->attach_deferred && ops->release_domain) - ops->release_domain->ops->attach_dev(ops->release_domain, dev); + if (!dev->iommu->attach_deferred && ops->release_domain) { + struct iommu_domain *release_domain = ops->release_domain; + + /* + * If the device requires direct mappings then it should not + * be parked on a BLOCKED domain during release as that would + * break the direct mappings. + */ + if (dev->iommu->require_direct && ops->identity_domain && + release_domain == ops->blocked_domain) + release_domain = ops->identity_domain; + + release_domain->ops->attach_dev(release_domain, dev, + group->domain); + } if (ops->release_device) ops->release_device(dev); @@ -628,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (group->default_domain) iommu_create_device_direct_mappings(group->default_domain, dev); if (group->domain) { - ret = __iommu_device_set_domain(group, dev, group->domain, 0); + ret = __iommu_device_set_domain(group, dev, group->domain, NULL, + 0); if (ret) goto err_remove_gdev; } else if (!group->default_domain && !group_list) { @@ -2115,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group) } static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { int ret; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; - ret = domain->ops->attach_dev(domain, dev); + ret = domain->ops->attach_dev(domain, dev, old); if (ret) return ret; dev->iommu->attach_deferred = 0; @@ -2171,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { if (dev->iommu && dev->iommu->attach_deferred) - return __iommu_attach_device(domain, dev); + return __iommu_attach_device(domain, dev, NULL); return 0; } @@ -2284,6 +2299,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_group); static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags) { int ret; @@ -2309,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group, dev->iommu->attach_deferred = 0; } - ret = __iommu_attach_device(new_domain, dev); + ret = __iommu_attach_device(new_domain, dev, old_domain); if (ret) { /* * If we have a blocking domain then try to attach that in hopes @@ -2319,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group, if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) && group->blocking_domain && group->blocking_domain != new_domain) - __iommu_attach_device(group->blocking_domain, dev); + __iommu_attach_device(group->blocking_domain, dev, + old_domain); return ret; } return 0; @@ -2366,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, result = 0; for_each_group_device(group, gdev) { ret = __iommu_device_set_domain(group, gdev->dev, new_domain, - flags); + group->domain, flags); if (ret) { result = ret; /* @@ -2391,6 +2408,9 @@ err_revert: */ last_gdev = gdev; for_each_group_device(group, gdev) { + /* No need to revert the last gdev that failed to set domain */ + if (gdev == last_gdev) + break; /* * A NULL domain can happen only for first probe, in which case * we leave group->domain as NULL and let release clean @@ -2398,10 +2418,8 @@ err_revert: */ if (group->domain) WARN_ON(__iommu_device_set_domain( - group, gdev->dev, group->domain, + group, gdev->dev, group->domain, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); - 
if (gdev == last_gdev) - break; } return ret; } diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 2beeb4f60ee5..eae3f03629b0 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -41,6 +41,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + depends on IOMMU_PT_AMDV1 select IOMMUFD_DRIVER default n help diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 75d60f2ad900..54cf4d856179 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -8,8 +8,10 @@ * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. */ +#include <linux/dma-buf.h> #include <linux/err.h> #include <linux/errno.h> +#include <linux/file.h> #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/lockdep.h> @@ -284,6 +286,9 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, case IOPT_ADDRESS_FILE: start = elm->start_byte + elm->pages->start; break; + case IOPT_ADDRESS_DMABUF: + start = elm->start_byte + elm->pages->dmabuf.start; + break; } rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) @@ -468,25 +473,53 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, * @iopt: io_pagetable to act on * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains * the chosen iova on output. Otherwise is the iova to map to on input - * @file: file to map + * @fd: fdno of a file to map * @start: map file starting at this byte offset * @length: Number of bytes to map * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping * @flags: IOPT_ALLOC_IOVA or zero */ int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, - unsigned long *iova, struct file *file, - unsigned long start, unsigned long length, - int iommu_prot, unsigned int flags) + unsigned long *iova, int fd, unsigned long start, + unsigned long length, int iommu_prot, + unsigned int flags) { struct iopt_pages *pages; + struct dma_buf *dmabuf; + unsigned long start_byte; + unsigned long last; + + if (!length) + return -EINVAL; + if (check_add_overflow(start, length - 1, &last)) + return -EOVERFLOW; + + start_byte = start - ALIGN_DOWN(start, PAGE_SIZE); + dmabuf = dma_buf_get(fd); + if (!IS_ERR(dmabuf)) { + pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start, + length, + iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) { + dma_buf_put(dmabuf); + return PTR_ERR(pages); + } + } else { + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; + + pages = iopt_alloc_file_pages(file, start_byte, start, length, + iommu_prot & IOMMU_WRITE); + fput(file); + if (IS_ERR(pages)) + return PTR_ERR(pages); + } - pages = iopt_alloc_file_pages(file, start, length, - iommu_prot & IOMMU_WRITE); - if (IS_ERR(pages)) - return PTR_ERR(pages); return iopt_map_common(ictx, iopt, pages, iova, length, - start - pages->start, iommu_prot, flags); + start_byte, iommu_prot, flags); } struct iova_bitmap_fn_arg { @@ -961,9 +994,15 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(!area->storage_domain); if (area->storage_domain == domain) area->storage_domain = storage_domain; + if (iopt_is_dmabuf(pages)) { + if (!iopt_dmabuf_revoked(pages)) + iopt_area_unmap_domain(area, domain); + iopt_dmabuf_untrack_domain(pages, area, domain); + } mutex_unlock(&pages->mutex); - iopt_area_unmap_domain(area, domain); + if 
(!iopt_is_dmabuf(pages)) + iopt_area_unmap_domain(area, domain); } return; } @@ -980,6 +1019,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(area->storage_domain != domain); area->storage_domain = NULL; iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } } @@ -1009,10 +1050,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt, if (!pages) continue; - mutex_lock(&pages->mutex); + guard(mutex)(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto out_unfill; + } rc = iopt_area_fill_domain(area, domain); if (rc) { - mutex_unlock(&pages->mutex); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); goto out_unfill; } if (!area->storage_domain) { @@ -1021,7 +1068,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt, interval_tree_insert(&area->pages_node, &pages->domains_itree); } - mutex_unlock(&pages->mutex); } return 0; @@ -1042,6 +1088,8 @@ out_unfill: area->storage_domain = NULL; } iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } return rc; @@ -1252,6 +1300,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) if (!pages || area->prevent_access) return -EBUSY; + /* Maintaining the domains_itree below is a bit complicated */ + if (iopt_is_dmabuf(pages)) + return -EOPNOTSUPP; + if (new_start & (alignment - 1) || iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index b6064f4ce4af..14cd052fd320 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -5,6 +5,7 @@ #ifndef __IO_PAGETABLE_H #define __IO_PAGETABLE_H +#include <linux/dma-buf.h> #include <linux/interval_tree.h> #include <linux/kref.h> #include <linux/mutex.h> @@ -69,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain); +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain); +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain); +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages); +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages); + static inline unsigned long iopt_area_index(struct iopt_area *area) { return area->pages_node.start; @@ -179,7 +190,22 @@ enum { enum iopt_address_type { IOPT_ADDRESS_USER = 0, - IOPT_ADDRESS_FILE = 1, + IOPT_ADDRESS_FILE, + IOPT_ADDRESS_DMABUF, +}; + +struct iopt_pages_dmabuf_track { + struct iommu_domain *domain; + struct iopt_area *area; + struct list_head elm; +}; + +struct iopt_pages_dmabuf { + struct dma_buf_attachment *attach; + struct dma_buf_phys_vec phys; + /* Always PAGE_SIZE aligned */ + unsigned long start; + struct list_head tracker; }; /* @@ -209,6 +235,8 @@ struct iopt_pages { struct file *file; unsigned long start; }; + /* IOPT_ADDRESS_DMABUF */ + struct iopt_pages_dmabuf dmabuf; }; bool writable:1; u8 account_mode; @@ -220,10 +248,32 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; +static inline bool iopt_is_dmabuf(struct iopt_pages *pages) +{ + if 
(!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return false; + return pages->type == IOPT_ADDRESS_DMABUF; +} + +static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages) +{ + lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) + return pages->dmabuf.phys.len == 0; + return false; +} + struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, unsigned long length, bool writable); -struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 459a7c516915..f4721afedadc 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -207,7 +207,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) unsigned long iova = cmd->iova; struct iommufd_ioas *ioas; unsigned int flags = 0; - struct file *file; int rc; if (cmd->flags & @@ -229,11 +228,7 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) flags = IOPT_ALLOC_IOVA; - file = fget(cmd->fd); - if (!file) - return -EBADF; - - rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file, + rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd, cmd->start, cmd->length, conv_iommu_prot(cmd->flags), flags); if (rc) @@ -243,7 +238,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: iommufd_put_object(ucmd->ictx, &ioas->obj); - fput(file); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 85d0843ed07b..eb6d1a70f673 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 +19,8 @@ struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; +struct dma_buf_attachment; +struct dma_buf_phys_vec; struct iommufd_sw_msi_map { struct list_head sw_msi_item; @@ -108,7 +110,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, - unsigned long *iova, struct file *file, + unsigned long *iova, int fd, unsigned long start, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, @@ -504,6 +506,8 @@ void iommufd_device_pre_destroy(struct iommufd_object *obj); void iommufd_device_destroy(struct iommufd_object *obj); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); +struct device *iommufd_global_device(void); + struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; @@ -713,6 +717,8 @@ bool iommufd_should_fail(void); int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys); #else static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, @@ -734,5 +740,11 @@ static inline 
bool iommufd_selftest_is_mock_dev(struct device *dev) { return false; } +static inline int +iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + return -EOPNOTSUPP; +} #endif #endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 8fc618b2bcf9..73e73e1ec158 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -29,11 +29,22 @@ enum { IOMMU_TEST_OP_PASID_REPLACE, IOMMU_TEST_OP_PASID_DETACH, IOMMU_TEST_OP_PASID_CHECK_HWPT, + IOMMU_TEST_OP_DMABUF_GET, + IOMMU_TEST_OP_DMABUF_REVOKE, }; enum { + MOCK_IOMMUPT_DEFAULT = 0, + MOCK_IOMMUPT_HUGE, + MOCK_IOMMUPT_AMDV1, +}; + +/* These values are true for MOCK_IOMMUPT_DEFAULT */ +enum { MOCK_APERTURE_START = 1UL << 24, MOCK_APERTURE_LAST = (1UL << 31) - 1, + MOCK_PAGE_SIZE = 2048, + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE, }; enum { @@ -52,7 +63,6 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, - MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, MOCK_FLAGS_DEVICE_PASID = 1 << 2, }; @@ -176,6 +186,14 @@ struct iommu_test_cmd { __u32 hwpt_id; /* @id is stdev_id */ } pasid_check; + struct { + __u32 length; + __u32 open_flags; + } dmabuf_get; + struct { + __s32 dmabuf_fd; + __u32 revoked; + } dmabuf_revoke; }; __u32 last; }; @@ -205,6 +223,7 @@ struct iommu_test_hw_info { */ struct iommu_hwpt_selftest { __u32 iotlb; + __u32 pagetable_type; }; /* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */ diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ce775fbbae94..5cc4b08c25f5 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -751,6 +751,15 @@ static struct miscdevice vfio_misc_dev = { .mode = 0666, }; +/* + * Used only by DMABUF, returns a valid struct device to use as a dummy struct + * device for attachment. + */ +struct device *iommufd_global_device(void) +{ + return iommu_misc_dev.this_device; +} + static int __init iommufd_init(void) { int ret; @@ -794,5 +803,6 @@ MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); MODULE_IMPORT_NS("IOMMUFD"); +MODULE_IMPORT_NS("DMA_BUF"); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index c3433b845561..dbe51ecb9a20 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,6 +45,8 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> #include <linux/file.h> #include <linux/highmem.h> #include <linux/iommu.h> @@ -53,6 +55,7 @@ #include <linux/overflow.h> #include <linux/slab.h> #include <linux/sched/mm.h> +#include <linux/vfio_pci_core.h> #include "double_span.h" #include "io_pagetable.h" @@ -258,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, return container_of(node, struct iopt_area, pages_node); } +enum batch_kind { + BATCH_CPU_MEMORY = 0, + BATCH_MMIO, +}; + /* * A simple datastructure to hold a vector of PFNs, optimized for contiguous * PFNs. 
This is used as a temporary holding memory for shuttling pfns from one @@ -271,7 +279,9 @@ struct pfn_batch { unsigned int array_size; unsigned int end; unsigned int total_pfns; + enum batch_kind kind; }; +enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) }; static void batch_clear(struct pfn_batch *batch) { @@ -348,11 +358,17 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) } static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, - u32 nr) + u32 nr, enum batch_kind kind) { - const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); unsigned int end = batch->end; + if (batch->kind != kind) { + /* One kind per batch */ + if (batch->end != 0) + return false; + batch->kind = kind; + } + if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && nr <= MAX_NPFNS - batch->npfns[end - 1]) { batch->npfns[end - 1] += nr; @@ -379,7 +395,7 @@ static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) /* true if the pfn was added, false otherwise */ static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) { - return batch_add_pfn_num(batch, pfn, 1); + return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY); } /* @@ -492,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, { bool disable_large_pages = area->iopt->disable_large_pages; unsigned long last_iova = iopt_area_last_iova(area); + int iommu_prot = area->iommu_prot; unsigned int page_offset = 0; unsigned long start_iova; unsigned long next_iova; @@ -499,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, unsigned long iova; int rc; + if (batch->kind == BATCH_MMIO) { + iommu_prot &= ~IOMMU_CACHE; + iommu_prot |= IOMMU_MMIO; + } + /* The first index might be a partial page */ if (start_index == iopt_area_index(area)) page_offset = area->page_offset; @@ -512,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, rc = batch_iommu_map_small( domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot); + next_iova - iova, iommu_prot); else rc = iommu_map(domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot, + next_iova - iova, iommu_prot, GFP_KERNEL_ACCOUNT); if (rc) goto err_unmap; @@ -652,7 +674,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, nr = min(nr, npages); npages -= nr; - if (!batch_add_pfn_num(batch, pfn, nr)) + if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY)) break; if (nr > 1) { rc = folio_add_pins(folio, nr - 1); @@ -1054,6 +1076,41 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, return iopt_pages_update_pinned(pages, npages, inc, user); } +struct pfn_reader_dmabuf { + struct dma_buf_phys_vec phys; + unsigned long start_offset; +}; + +static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf, + struct iopt_pages *pages) +{ + /* Callers must not get here if the dmabuf was already revoked */ + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return -EINVAL; + + dmabuf->phys = pages->dmabuf.phys; + dmabuf->start_offset = pages->dmabuf.start; + return 0; +} + +static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf, + struct pfn_batch *batch, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE; + + /* + * start/last_index and start are all PAGE_SIZE aligned, the batch is + * always 
filled using page size aligned PFNs just like the other types. + * If the dmabuf has been sliced on a sub page offset then the common + * batch to domain code will adjust it before mapping to the domain. + */ + batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start), + last_index - start_index + 1, BATCH_MMIO); + return 0; +} + /* * PFNs are stored in three places, in order of preference: * - The iopt_pages xarray. This is only populated if there is a @@ -1072,7 +1129,10 @@ struct pfn_reader { unsigned long batch_end_index; unsigned long last_index; - struct pfn_reader_user user; + union { + struct pfn_reader_user user; + struct pfn_reader_dmabuf dmabuf; + }; }; static int pfn_reader_update_pinned(struct pfn_reader *pfns) @@ -1108,7 +1168,7 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; - struct pfn_reader_user *user = &pfns->user; + struct pfn_reader_user *user; unsigned long npages; struct iopt_area *area; int rc; @@ -1140,8 +1200,13 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) return 0; } - if (start_index >= pfns->user.upages_end) { - rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + if (iopt_is_dmabuf(pfns->pages)) + return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch, + start_index, span->last_hole); + + user = &pfns->user; + if (start_index >= user->upages_end) { + rc = pfn_reader_user_pin(user, pfns->pages, start_index, span->last_hole); if (rc) return rc; @@ -1209,7 +1274,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, pfns->batch_start_index = start_index; pfns->batch_end_index = start_index; pfns->last_index = last_index; - pfn_reader_user_init(&pfns->user, pages); + if (iopt_is_dmabuf(pages)) + pfn_reader_dmabuf_init(&pfns->dmabuf, pages); + else + pfn_reader_user_init(&pfns->user, pages); rc = batch_init(&pfns->batch, last_index - start_index + 1); if (rc) return rc; @@ -1230,8 +1298,12 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; - struct pfn_reader_user *user = &pfns->user; + struct pfn_reader_user *user; + + if (iopt_is_dmabuf(pages)) + return; + user = &pfns->user; if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ @@ -1261,7 +1333,8 @@ static void pfn_reader_destroy(struct pfn_reader *pfns) struct iopt_pages *pages = pfns->pages; pfn_reader_release_pins(pfns); - pfn_reader_user_destroy(&pfns->user, pfns->pages); + if (!iopt_is_dmabuf(pfns->pages)) + pfn_reader_user_destroy(&pfns->user, pfns->pages); batch_destroy(&pfns->batch, NULL); WARN_ON(pages->last_npinned != pages->npinned); } @@ -1340,26 +1413,234 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, return pages; } -struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, unsigned long length, bool writable) { struct iopt_pages *pages; - unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE); - unsigned long end; - if (length && check_add_overflow(start, length - 1, &end)) - return ERR_PTR(-EOVERFLOW); - - pages = iopt_alloc_pages(start - start_down, length, writable); + pages = iopt_alloc_pages(start_byte, length, writable); if (IS_ERR(pages)) return pages; pages->file = 
get_file(file); - pages->start = start_down; + pages->start = start - start_byte; pages->type = IOPT_ADDRESS_FILE; return pages; } +static void iopt_revoke_notify(struct dma_buf_attachment *attach) +{ + struct iopt_pages *pages = attach->importer_priv; + struct iopt_pages_dmabuf_track *track; + + guard(mutex)(&pages->mutex); + if (iopt_dmabuf_revoked(pages)) + return; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + struct iopt_area *area = track->area; + + iopt_area_unmap_domain_range(area, track->domain, + iopt_area_index(area), + iopt_area_last_index(area)); + } + pages->dmabuf.phys.len = 0; +} + +static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = { + .allow_peer2peer = true, + .move_notify = iopt_revoke_notify, +}; + +/* + * iommufd and vfio have a circular dependency. Future work for a phys + * based private interconnect will remove this. + */ +static int +sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + typeof(&vfio_pci_dma_buf_iommufd_map) fn; + int rc; + + rc = iommufd_test_dma_buf_iommufd_map(attachment, phys); + if (rc != -EOPNOTSUPP) + return rc; + + if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)) + return -EOPNOTSUPP; + + fn = symbol_get(vfio_pci_dma_buf_iommufd_map); + if (!fn) + return -EOPNOTSUPP; + rc = fn(attachment, phys); + symbol_put(vfio_pci_dma_buf_iommufd_map); + return rc; +} + +static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages, + struct dma_buf *dmabuf) +{ + struct dma_buf_attachment *attach; + int rc; + + attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(), + &iopt_dmabuf_attach_revoke_ops, pages); + if (IS_ERR(attach)) + return PTR_ERR(attach); + + dma_resv_lock(dmabuf->resv, NULL); + /* + * Lock ordering requires the mutex to be taken inside the reservation, + * make sure lockdep sees this. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + mutex_lock(&pages->mutex); + mutex_unlock(&pages->mutex); + } + + rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys); + if (rc) + goto err_detach; + + dma_resv_unlock(dmabuf->resv); + + /* On success iopt_release_pages() will detach and put the dmabuf. */ + pages->dmabuf.attach = attach; + return 0; + +err_detach: + dma_resv_unlock(dmabuf->resv); + dma_buf_detach(dmabuf, attach); + return rc; +} + +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable) +{ + static struct lock_class_key pages_dmabuf_mutex_key; + struct iopt_pages *pages; + int rc; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return ERR_PTR(-EOPNOTSUPP); + + if (dmabuf->size <= (start + length - 1) || + length / PAGE_SIZE >= MAX_NPFNS) + return ERR_PTR(-EINVAL); + + pages = iopt_alloc_pages(start_byte, length, writable); + if (IS_ERR(pages)) + return pages; + + /* + * The mmap_lock can be held when obtaining the dmabuf reservation lock + * which creates a locking cycle with the pages mutex which is held + * while obtaining the mmap_lock. This locking path is not present for + * IOPT_ADDRESS_DMABUF so split the lock class. + */ + lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key); + + /* dmabuf does not use pinned page accounting. 
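+ * The backing memory is provided by the dma-buf exporter rather than pinned
+ * from the caller's address space.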
*/ + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + pages->type = IOPT_ADDRESS_DMABUF; + pages->dmabuf.start = start - start_byte; + INIT_LIST_HEAD(&pages->dmabuf.tracker); + + rc = iopt_map_dmabuf(ictx, pages, dmabuf); + if (rc) { + iopt_put_pages(pages); + return ERR_PTR(rc); + } + + return pages; +} + +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + if (WARN_ON(!iopt_is_dmabuf(pages))) + return -EINVAL; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->domain == domain && track->area == area)) + return -EINVAL; + + track = kzalloc(sizeof(*track), GFP_KERNEL); + if (!track) + return -ENOMEM; + track->domain = domain; + track->area = area; + list_add_tail(&track->elm, &pages->dmabuf.tracker); + + return 0; +} + +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + WARN_ON(!iopt_is_dmabuf(pages)); + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + if (track->domain == domain && track->area == area) { + list_del(&track->elm); + kfree(track); + return; + } + } + WARN_ON(true); +} + +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iommu_domain *domain; + unsigned long index; + int rc; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->area == area)) + return -EINVAL; + + xa_for_each(&area->iopt->domains, index, domain) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto err_untrack; + } + return 0; +err_untrack: + iopt_dmabuf_untrack_all_domains(area, pages); + return rc; +} + +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iopt_pages_dmabuf_track *tmp; + + list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker, + elm) { + if (track->area == area) { + list_del(&track->elm); + kfree(track); + } + } +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1372,8 +1653,15 @@ void iopt_release_pages(struct kref *kref) mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); - if (pages->type == IOPT_ADDRESS_FILE) + if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) { + struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf; + + dma_buf_detach(dmabuf, pages->dmabuf.attach); + dma_buf_put(dmabuf); + WARN_ON(!list_empty(&pages->dmabuf.tracker)); + } else if (pages->type == IOPT_ADDRESS_FILE) { fput(pages->file); + } kfree(pages); } @@ -1451,6 +1739,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area, lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return; + iopt_area_unmap_domain_range(area, domain, start_index, + last_index); + return; + } + /* * For security we must not unpin something that is still DMA mapped, * so this must unmap any IOVA before we go ahead and unpin the pages. 
@@ -1526,6 +1822,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain) { + if (iopt_dmabuf_revoked(pages)) + return; + __iopt_area_unfill_domain(area, pages, domain, iopt_area_last_index(area)); } @@ -1546,6 +1845,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) lockdep_assert_held(&area->pages->mutex); + if (iopt_dmabuf_revoked(area->pages)) + return 0; + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), iopt_area_last_index(area)); if (rc) @@ -1605,33 +1907,44 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) return 0; mutex_lock(&pages->mutex); - rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), - iopt_area_last_index(area)); - if (rc) - goto out_unlock; + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_all_domains(area, pages); + if (rc) + goto out_unlock; + } - while (!pfn_reader_done(&pfns)) { - done_first_end_index = pfns.batch_end_index; - done_all_end_index = pfns.batch_start_index; - xa_for_each(&area->iopt->domains, index, domain) { - rc = batch_to_domain(&pfns.batch, domain, area, - pfns.batch_start_index); + if (!iopt_dmabuf_revoked(pages)) { + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_untrack; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; + + rc = pfn_reader_next(&pfns); if (rc) goto out_unmap; } - done_all_end_index = done_first_end_index; - - rc = pfn_reader_next(&pfns); + rc = pfn_reader_update_pinned(&pfns); if (rc) goto out_unmap; + + pfn_reader_destroy(&pfns); } - rc = pfn_reader_update_pinned(&pfns); - if (rc) - goto out_unmap; area->storage_domain = xa_load(&area->iopt->domains, 0); interval_tree_insert(&area->pages_node, &pages->domains_itree); - goto out_destroy; + mutex_unlock(&pages->mutex); + return 0; out_unmap: pfn_reader_release_pins(&pfns); @@ -1658,8 +1971,10 @@ out_unmap: end_index); } } -out_destroy: pfn_reader_destroy(&pfns); +out_untrack: + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); out_unlock: mutex_unlock(&pages->mutex); return rc; @@ -1685,16 +2000,22 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) if (!area->storage_domain) goto out_unlock; - xa_for_each(&iopt->domains, index, domain) - if (domain != area->storage_domain) + xa_for_each(&iopt->domains, index, domain) { + if (domain == area->storage_domain) + continue; + + if (!iopt_dmabuf_revoked(pages)) iopt_area_unmap_domain_range( area, domain, iopt_area_index(area), iopt_area_last_index(area)); + } if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); interval_tree_remove(&area->pages_node, &pages->domains_itree); iopt_area_unfill_domain(area, pages, area->storage_domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); area->storage_domain = NULL; out_unlock: mutex_unlock(&pages->mutex); @@ -2031,15 +2352,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; - if (pages->type == 
IOPT_ADDRESS_FILE) + if (iopt_is_dmabuf(pages)) + return -EINVAL; + + if (pages->type != IOPT_ADDRESS_USER) return iopt_pages_rw_slow(pages, start_index, last_index, start_byte % PAGE_SIZE, data, length, flags); - if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && - WARN_ON(pages->type != IOPT_ADDRESS_USER)) - return -EINVAL; - if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index de178827a078..c4322fd26f93 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -5,6 +5,8 @@ */ #include <linux/anon_inodes.h> #include <linux/debugfs.h> +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> #include <linux/fault-inject.h> #include <linux/file.h> #include <linux/iommu.h> @@ -12,6 +14,8 @@ #include <linux/slab.h> #include <linux/xarray.h> #include <uapi/linux/iommufd.h> +#include <linux/generic_pt/iommu.h> +#include "../iommu-pages.h" #include "../iommu-priv.h" #include "io_pagetable.h" @@ -41,21 +45,6 @@ static DEFINE_IDA(mock_dev_ida); enum { MOCK_DIRTY_TRACK = 1, - MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, - MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, - - /* - * Like a real page table alignment requires the low bits of the address - * to be zero. xarray also requires the high bit to be zero, so we store - * the pfns shifted. The upper bits are used for metadata. - */ - MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, - - _MOCK_PFN_START = MOCK_PFN_MASK + 1, - MOCK_PFN_START_IOVA = _MOCK_PFN_START, - MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, - MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, - MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain); @@ -124,10 +113,15 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, } struct mock_iommu_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + }; unsigned long flags; - struct iommu_domain domain; - struct xarray pfns; }; +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain); static inline struct mock_iommu_domain * to_mock_domain(struct iommu_domain *domain) @@ -216,7 +210,7 @@ static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj) } static int mock_domain_nop_attach(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mock_dev *mdev = to_mock_dev(dev); struct mock_viommu *new_viommu = NULL; @@ -344,74 +338,6 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, - unsigned long iova, size_t page_size, - unsigned long flags) -{ - unsigned long cur, end = iova + page_size - 1; - bool dirty = false; - void *ent, *old; - - for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); - if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) - continue; - - dirty = true; - /* Clear dirty */ - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { - unsigned long val; - - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - } - } - - return dirty; -} - -static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long 
iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct mock_iommu_domain *mock = to_mock_domain(domain); - unsigned long end = iova + size; - void *ent; - - if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) - return -EINVAL; - - do { - unsigned long pgsize = MOCK_IO_PAGE_SIZE; - unsigned long head; - - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent) { - iova += pgsize; - continue; - } - - if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) - pgsize = MOCK_HUGE_PAGE_SIZE; - head = iova & ~(pgsize - 1); - - /* Clear dirty */ - if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -static const struct iommu_dirty_ops dirty_ops = { - .set_dirty_tracking = mock_domain_set_dirty_tracking, - .read_and_clear_dirty = mock_domain_read_and_clear_dirty, -}; - static struct mock_iommu_domain_nested * __mock_domain_alloc_nested(const struct iommu_user_data *user_data) { @@ -446,7 +372,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); - if (!parent || parent->ops != mock_ops.default_domain_ops) + if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING)) return ERR_PTR(-EINVAL); mock_parent = to_mock_domain(parent); @@ -459,159 +385,170 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, return &mock_nested->domain; } -static struct iommu_domain * -mock_domain_alloc_paging_flags(struct device *dev, u32 flags, - const struct iommu_user_data *user_data) -{ - bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_PASID; - struct mock_dev *mdev = to_mock_dev(dev); - bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; - struct mock_iommu_domain *mock; - - if (user_data) - return ERR_PTR(-EOPNOTSUPP); - if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) - return ERR_PTR(-EOPNOTSUPP); - - mock = kzalloc(sizeof(*mock), GFP_KERNEL); - if (!mock) - return ERR_PTR(-ENOMEM); - mock->domain.geometry.aperture_start = MOCK_APERTURE_START; - mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; - mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; - if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) - mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; - mock->domain.ops = mock_ops.default_domain_ops; - mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - xa_init(&mock->pfns); - - if (has_dirty_flag) - mock->domain.dirty_ops = &dirty_ops; - return &mock->domain; -} - static void mock_domain_free(struct iommu_domain *domain) { struct mock_iommu_domain *mock = to_mock_domain(domain); - WARN_ON(!xa_empty(&mock->pfns)); + pt_iommu_deinit(&mock->iommu); kfree(mock); } -static int mock_domain_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, int prot, - gfp_t gfp, size_t *mapped) +static void mock_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - unsigned long flags = MOCK_PFN_START_IOVA; - unsigned long start_iova = iova; + iommu_put_pages_list(&gather->freelist); +} - /* - * xarray does not reliably work with fault injection because it does a - * retry allocation, so put our own failure point. 
- */ - if (iommufd_should_fail()) - return -ENOENT; +static const struct iommu_domain_ops amdv1_mock_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); - for (; pgcount; pgcount--) { - size_t cur; +static const struct iommu_domain_ops amdv1_mock_huge_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; +#undef pt_iommu_amdv1_mock_map_pages - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - void *old; +static const struct iommu_dirty_ops amdv1_mock_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1_mock), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; - if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - flags = MOCK_PFN_LAST_IOVA; - if (pgsize != MOCK_IO_PAGE_SIZE) { - flags |= MOCK_PFN_HUGE_IOVA; - } - old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, - xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | - flags), - gfp); - if (xa_is_err(old)) { - for (; start_iova != iova; - start_iova += MOCK_IO_PAGE_SIZE) - xa_erase(&mock->pfns, - start_iova / - MOCK_IO_PAGE_SIZE); - return xa_err(old); - } - WARN_ON(old); - iova += MOCK_IO_PAGE_SIZE; - paddr += MOCK_IO_PAGE_SIZE; - *mapped += MOCK_IO_PAGE_SIZE; - flags = 0; - } - } - return 0; -} +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; -static size_t mock_domain_unmap_pages(struct iommu_domain *domain, - unsigned long iova, size_t pgsize, - size_t pgcount, - struct iommu_iotlb_gather *iotlb_gather) +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; + +static struct mock_iommu_domain * +mock_domain_alloc_pgtable(struct device *dev, + const struct iommu_hwpt_selftest *user_cfg, u32 flags) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - bool first = true; - size_t ret = 0; - void *ent; + struct mock_iommu_domain *mock; + int rc; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return ERR_PTR(-ENOMEM); + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - for (; pgcount; pgcount--) { - size_t cur; + mock->amdv1.iommu.nid = NUMA_NO_NODE; + + switch (user_cfg->pagetable_type) { + case MOCK_IOMMUPT_DEFAULT: + case MOCK_IOMMUPT_HUGE: { + struct pt_iommu_amdv1_cfg cfg = {}; + + /* The mock version has a 2k page size */ + cfg.common.hw_max_vasz_lg2 = 56; + cfg.common.hw_max_oasz_lg2 = 51; + cfg.starting_level = 2; + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.ops = &amdv1_mock_huge_ops; + else + mock->domain.ops = &amdv1_mock_ops; + rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + + /* + * In huge mode userspace should only provide huge pages, we + * have to include PAGE_SIZE for the domain to be accepted by + * iommufd. 
+ */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE | + PAGE_SIZE; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_mock_dirty_ops; + break; + } - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + case MOCK_IOMMUPT_AMDV1: { + struct pt_iommu_amdv1_cfg cfg = {}; + + cfg.common.hw_max_vasz_lg2 = 64; + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); + cfg.starting_level = 2; + mock->domain.ops = &amdv1_ops; + rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_dirty_ops; + break; + } + default: + rc = -EOPNOTSUPP; + goto err_free; + } - /* - * iommufd generates unmaps that must be a strict - * superset of the map's performend So every - * starting/ending IOVA should have been an iova passed - * to map. - * - * This simple logic doesn't work when the HUGE_PAGE is - * turned on since the core code will automatically - * switch between the two page sizes creating a break in - * the unmap calls. The break can land in the middle of - * contiguous IOVA. - */ - if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) { - if (first) { - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); - first = false; - } - if (pgcount == 1 && - cur + MOCK_IO_PAGE_SIZE == pgsize) - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); - } + /* + * Override the real aperture to the MOCK aperture for test purposes. + */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) { + WARN_ON(mock->domain.geometry.aperture_start != 0); + WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST); - iova += MOCK_IO_PAGE_SIZE; - ret += MOCK_IO_PAGE_SIZE; - } + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; } - return ret; + + return mock; +err_free: + kfree(mock); + return ERR_PTR(rc); } -static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) +static struct iommu_domain * +mock_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - void *ent; + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID; + struct mock_dev *mdev = to_mock_dev(dev); + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_hwpt_selftest user_cfg = {}; + struct mock_iommu_domain *mock; + int rc; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - WARN_ON(!ent); - return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; + if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + + if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST && + user_data->type != IOMMU_HWPT_DATA_NONE)) + return ERR_PTR(-EOPNOTSUPP); + + if (user_data) { + rc = iommu_copy_struct_from_user( + &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); + } + + mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags); + if (IS_ERR(mock)) + return ERR_CAST(mock); + return &mock->domain; } static bool 
mock_domain_capable(struct device *dev, enum iommu_cap cap) @@ -955,15 +892,6 @@ static const struct iommu_ops mock_ops = { .user_pasid_table = true, .get_viommu_size = mock_get_viommu_size, .viommu_init = mock_viommu_init, - .default_domain_ops = - &(struct iommu_domain_ops){ - .free = mock_domain_free, - .attach_dev = mock_domain_nop_attach, - .map_pages = mock_domain_map_pages, - .unmap_pages = mock_domain_unmap_pages, - .iova_to_phys = mock_domain_iova_to_phys, - .set_dev_pasid = mock_domain_set_dev_pasid_nop, - }, }; static void mock_domain_free_nested(struct iommu_domain *domain) @@ -1047,7 +975,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, if (IS_ERR(hwpt)) return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || - hwpt->domain->ops != mock_ops.default_domain_ops) { + hwpt->domain->owner != &mock_ops) { iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } @@ -1088,7 +1016,6 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) {}, }; const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY | - MOCK_FLAGS_DEVICE_HUGE_IOVA | MOCK_FLAGS_DEVICE_PASID; struct mock_dev *mdev; int rc, i; @@ -1277,23 +1204,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, { struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; + unsigned int page_size; uintptr_t end; int rc; - if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || - (uintptr_t)uptr % MOCK_IO_PAGE_SIZE || - check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) - return -EINVAL; - hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - for (; length; length -= MOCK_IO_PAGE_SIZE) { + page_size = 1 << __ffs(mock->domain.pgsize_bitmap); + if (iova % page_size || length % page_size || + (uintptr_t)uptr % page_size || + check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) + return -EINVAL; + + for (; length; length -= page_size) { struct page *pages[1]; + phys_addr_t io_phys; unsigned long pfn; long npages; - void *ent; npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, pages); @@ -1308,15 +1237,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, pfn = page_to_pfn(pages[0]); put_page(pages[0]); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent || - (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != - pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova); + if (io_phys != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { rc = -EINVAL; goto out_put; } - iova += MOCK_IO_PAGE_SIZE; - uptr += MOCK_IO_PAGE_SIZE; + iova += page_size; + uptr += page_size; } rc = 0; @@ -1795,7 +1723,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - if (!(mock->flags & MOCK_DIRTY_TRACK)) { + if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) { rc = -EINVAL; goto out_put; } @@ -1814,22 +1742,10 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, } for (i = 0; i < max; i++) { - unsigned long cur = iova + i * page_size; - void *ent, *old; - if (!test_bit(i, (unsigned long *)tmp)) continue; - - ent = xa_load(&mock->pfns, cur / page_size); - if (ent) { - unsigned long val; - - val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / page_size, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - count++; - } + mock->iommu.ops->set_dirty(&mock->iommu, iova + i * 
page_size); + count++; } cmd->dirty.out_nr_dirty = count; @@ -2031,6 +1947,140 @@ void iommufd_selftest_destroy(struct iommufd_object *obj) } } +struct iommufd_test_dma_buf { + void *memory; + size_t length; + bool revoked; +}; + +static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + return 0; +} + +static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ +} + +static struct sg_table * +iommufd_test_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ +} + +static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf) +{ + struct iommufd_test_dma_buf *priv = dmabuf->priv; + + kfree(priv->memory); + kfree(priv); +} + +static const struct dma_buf_ops iommufd_test_dmabuf_ops = { + .attach = iommufd_test_dma_buf_attach, + .detach = iommufd_test_dma_buf_detach, + .map_dma_buf = iommufd_test_dma_buf_map, + .release = iommufd_test_dma_buf_release, + .unmap_dma_buf = iommufd_test_dma_buf_unmap, +}; + +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(attachment->dmabuf->resv); + + if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + phys->paddr = virt_to_phys(priv->memory); + phys->len = priv->length; + return 0; +} + +static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd, + unsigned int open_flags, + size_t len) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc; + + len = ALIGN(len, PAGE_SIZE); + if (len == 0 || len > PAGE_SIZE * 512) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->length = len; + priv->memory = kzalloc(len, GFP_KERNEL); + if (!priv->memory) { + rc = -ENOMEM; + goto err_free; + } + + exp_info.ops = &iommufd_test_dmabuf_ops; + exp_info.size = len; + exp_info.flags = open_flags; + exp_info.priv = priv; + + dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(dmabuf)) { + rc = PTR_ERR(dmabuf); + goto err_free; + } + + return dma_buf_fd(dmabuf, open_flags); + +err_free: + kfree(priv->memory); + kfree(priv); + return rc; +} + +static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd, + bool revoked) +{ + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc = 0; + + dmabuf = dma_buf_get(fd); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + if (dmabuf->ops != &iommufd_test_dmabuf_ops) { + rc = -EOPNOTSUPP; + goto err_put; + } + + priv = dmabuf->priv; + dma_resv_lock(dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(dmabuf); + dma_resv_unlock(dmabuf->resv); + +err_put: + dma_buf_put(dmabuf); + return rc; +} + int iommufd_test(struct iommufd_ucmd *ucmd) { struct iommu_test_cmd *cmd = ucmd->cmd; @@ -2109,6 +2159,13 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_pasid_detach(ucmd, cmd); case IOMMU_TEST_OP_PASID_CHECK_HWPT: return iommufd_test_pasid_check_hwpt(ucmd, cmd); + case IOMMU_TEST_OP_DMABUF_GET: + return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags, + cmd->dmabuf_get.length); + case IOMMU_TEST_OP_DMABUF_REVOKE: + return 
iommufd_test_dmabuf_revoke(ucmd, + cmd->dmabuf_revoke.dmabuf_fd, + cmd->dmabuf_revoke.revoked); default: return -EOPNOTSUPP; } @@ -2202,3 +2259,5 @@ void iommufd_test_exit(void) platform_device_unregister(selftest_iommu_dev); debugfs_remove_recursive(dbgfs_root); } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index ffa892f65714..ca848288dbf2 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -590,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) } static int ipmmu_attach_device(struct iommu_domain *io_domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); @@ -637,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, } static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_domain *domain; unsigned int i; - if (io_domain == identity_domain || !io_domain) + if (old == identity_domain || !old) return 0; - domain = to_vmsa_domain(io_domain); + domain = to_vmsa_domain(old); for (i = 0; i < fwspec->num_ids; ++i) ipmmu_utlb_disable(domain, fwspec->ids[i]); @@ -720,6 +720,8 @@ static int ipmmu_init_platform_device(struct device *dev, dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev)); + put_device(&ipmmu_pdev->dev); + return 0; } diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 43a61ba021a5..819add75a665 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev) return &iommu->iommu; } -static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; unsigned long flags; @@ -441,19 +442,19 @@ fail: } static int msm_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct msm_priv *priv; unsigned long flags; struct msm_iommu_dev *iommu; struct msm_iommu_ctx_dev *master; int ret = 0; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - priv = to_msm_priv(domain); + priv = to_msm_priv(old); free_io_pgtable_ops(priv->iop); spin_lock_irqsave(&msm_iommu_lock, flags); diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 0e0285348d2b..60fcd3d3b5eb 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -139,6 +139,7 @@ /* 2 bits: iommu type */ #define MTK_IOMMU_TYPE_MM (0x0 << 13) #define MTK_IOMMU_TYPE_INFRA (0x1 << 13) +#define MTK_IOMMU_TYPE_APU (0x2 << 13) #define MTK_IOMMU_TYPE_MASK (0x3 << 13) /* PM and clock always on. e.g. 
infra iommu */ #define PM_CLK_AO BIT(15) @@ -147,6 +148,7 @@ #define TF_PORT_TO_ADDR_MT8173 BIT(18) #define INT_ID_PORT_WIDTH_6 BIT(19) #define CFG_IFA_MASTER_IN_ATF BIT(20) +#define DL_WITH_MULTI_LARB BIT(21) #define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \ ((((pdata)->flags) & (mask)) == (_x)) @@ -172,6 +174,7 @@ enum mtk_iommu_plat { M4U_MT8183, M4U_MT8186, M4U_MT8188, + M4U_MT8189, M4U_MT8192, M4U_MT8195, M4U_MT8365, @@ -335,6 +338,8 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data, unsigned int ban */ #define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL +static LIST_HEAD(apulist); /* List the apu iommu HWs */ +static LIST_HEAD(infralist); /* List the iommu_infra HW */ static LIST_HEAD(m4ulist); /* List all the M4U HWs */ #define for_each_m4u(data, head) list_for_each_entry(data, head, list) @@ -350,6 +355,15 @@ static const struct mtk_iommu_iova_region single_domain[] = { #define MT8192_MULTI_REGION_NR (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) ? \ MT8192_MULTI_REGION_NR_MAX : 1) +static const struct mtk_iommu_iova_region mt8189_multi_dom_apu[] = { + { .iova_base = 0x200000ULL, .size = SZ_512M}, /* APU SECURE */ +#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) + { .iova_base = SZ_1G, .size = 0xc0000000}, /* APU CODE */ + { .iova_base = 0x70000000ULL, .size = 0x12600000}, /* APU VLM */ + { .iova_base = SZ_4G, .size = SZ_4G * 3}, /* APU VPU */ +#endif +}; + static const struct mtk_iommu_iova_region mt8192_multi_dom[MT8192_MULTI_REGION_NR] = { { .iova_base = 0x0, .size = MTK_IOMMU_IOVA_SZ_4G}, /* 0 ~ 4G, */ #if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) @@ -705,7 +719,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain) } static int mtk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata; struct mtk_iommu_domain *dom = to_mtk_domain(domain); @@ -773,12 +787,12 @@ err_unlock: } static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct mtk_iommu_data *data = dev_iommu_priv_get(dev); - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; mtk_iommu_config(data, dev, false, 0); @@ -865,6 +879,7 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev) struct mtk_iommu_data *data = dev_iommu_priv_get(dev); struct device_link *link; struct device *larbdev; + unsigned long larbid_msk = 0; unsigned int larbid, larbidx, i; if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) @@ -872,30 +887,50 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev) /* * Link the consumer device with the smi-larb device(supplier). - * The device that connects with each a larb is a independent HW. - * All the ports in each a device should be in the same larbs. + * w/DL_WITH_MULTI_LARB: the master may connect with multi larbs, + * we should create device link with each larb. + * w/o DL_WITH_MULTI_LARB: the master must connect with one larb, + * otherwise fail. 
*/ larbid = MTK_M4U_TO_LARB(fwspec->ids[0]); if (larbid >= MTK_LARB_NR_MAX) return ERR_PTR(-EINVAL); + larbid_msk |= BIT(larbid); + for (i = 1; i < fwspec->num_ids; i++) { larbidx = MTK_M4U_TO_LARB(fwspec->ids[i]); - if (larbid != larbidx) { + if (MTK_IOMMU_HAS_FLAG(data->plat_data, DL_WITH_MULTI_LARB)) { + larbid_msk |= BIT(larbidx); + } else if (larbid != larbidx) { dev_err(dev, "Can only use one larb. Fail@larb%d-%d.\n", larbid, larbidx); return ERR_PTR(-EINVAL); } } - larbdev = data->larb_imu[larbid].dev; - if (!larbdev) - return ERR_PTR(-EINVAL); - link = device_link_add(dev, larbdev, - DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); - if (!link) - dev_err(dev, "Unable to link %s\n", dev_name(larbdev)); + for_each_set_bit(larbid, &larbid_msk, 32) { + larbdev = data->larb_imu[larbid].dev; + if (!larbdev) + return ERR_PTR(-EINVAL); + + link = device_link_add(dev, larbdev, + DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); + if (!link) { + dev_err(dev, "Unable to link %s\n", dev_name(larbdev)); + goto link_remove; + } + } + return &data->iommu; + +link_remove: + for_each_set_bit(i, &larbid_msk, larbid) { + larbdev = data->larb_imu[i].dev; + device_link_remove(dev, larbdev); + } + + return ERR_PTR(-ENODEV); } static void mtk_iommu_release_device(struct device *dev) @@ -903,11 +938,19 @@ static void mtk_iommu_release_device(struct device *dev) struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; struct device *larbdev; - unsigned int larbid; + unsigned int larbid, i; + unsigned long larbid_msk = 0; data = dev_iommu_priv_get(dev); - if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { - larbid = MTK_M4U_TO_LARB(fwspec->ids[0]); + if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) + return; + + for (i = 0; i < fwspec->num_ids; i++) { + larbid = MTK_M4U_TO_LARB(fwspec->ids[i]); + larbid_msk |= BIT(larbid); + } + + for_each_set_bit(larbid, &larbid_msk, 32) { larbdev = data->larb_imu[larbid].dev; device_link_remove(dev, larbdev); } @@ -974,6 +1017,8 @@ static int mtk_iommu_of_xlate(struct device *dev, return -EINVAL; dev_iommu_priv_set(dev, platform_get_drvdata(m4updev)); + + put_device(&m4updev->dev); } return iommu_fwspec_add_ids(dev, args->args, 1); @@ -1211,16 +1256,19 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m } component_match_add(dev, match, component_compare_dev, &plarbdev->dev); - platform_device_put(plarbdev); } - if (!frst_avail_smicomm_node) - return -EINVAL; + if (!frst_avail_smicomm_node) { + ret = -EINVAL; + goto err_larbdev_put; + } pcommdev = of_find_device_by_node(frst_avail_smicomm_node); of_node_put(frst_avail_smicomm_node); - if (!pcommdev) - return -ENODEV; + if (!pcommdev) { + ret = -ENODEV; + goto err_larbdev_put; + } data->smicomm_dev = &pcommdev->dev; link = device_link_add(data->smicomm_dev, dev, @@ -1228,16 +1276,16 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m platform_device_put(pcommdev); if (!link) { dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev)); - return -EINVAL; + ret = -EINVAL; + goto err_larbdev_put; } return 0; err_larbdev_put: - for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) { - if (!data->larb_imu[i].dev) - continue; + /* id mapping may not be linear, loop the whole array */ + for (i = 0; i < MTK_LARB_NR_MAX; i++) put_device(data->larb_imu[i].dev); - } + return ret; } @@ -1400,8 +1448,12 @@ out_sysfs_remove: iommu_device_sysfs_remove(&data->iommu); out_list_del: list_del(&data->list); - if (MTK_IOMMU_IS_TYPE(data->plat_data, 
MTK_IOMMU_TYPE_MM)) + if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { device_link_remove(data->smicomm_dev, dev); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); + } out_runtime_disable: pm_runtime_disable(dev); return ret; @@ -1421,6 +1473,9 @@ static void mtk_iommu_remove(struct platform_device *pdev) if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { device_link_remove(data->smicomm_dev, &pdev->dev); component_master_del(&pdev->dev, &mtk_iommu_com_ops); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); } pm_runtime_disable(&pdev->dev); for (i = 0; i < data->plat_data->banks_num; i++) { @@ -1695,6 +1750,66 @@ static const struct mtk_iommu_plat_data mt8188_data_vpp = { 27, 28 /* ccu0 */, MTK_INVALID_LARBID}, {4, 6}}, }; +static const unsigned int mt8189_apu_region_msk[][MTK_LARB_NR_MAX] = { + [0] = {[0] = BIT(2)}, /* Region0: fake larb 0 APU_SECURE */ + [1] = {[0] = BIT(1)}, /* Region1: fake larb 0 APU_CODE */ + [2] = {[0] = BIT(3)}, /* Region2: fake larb 0 APU_VLM */ + [3] = {[0] = BIT(0)}, /* Region3: fake larb 0 APU_DATA */ +}; + +static const struct mtk_iommu_plat_data mt8189_data_apu = { + .m4u_plat = M4U_MT8189, + .flags = IOVA_34_EN | DCM_DISABLE | + MTK_IOMMU_TYPE_APU | PGTABLE_PA_35_EN, + .hw_list = &apulist, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .banks_num = 1, + .banks_enable = {true}, + .iova_region = mt8189_multi_dom_apu, + .iova_region_nr = ARRAY_SIZE(mt8189_multi_dom_apu), + .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}}, + .iova_region_larb_msk = mt8189_apu_region_msk, +}; + +static const struct mtk_iommu_plat_data mt8189_data_infra = { + .m4u_plat = M4U_MT8189, + .flags = WR_THROT_EN | DCM_DISABLE | MTK_IOMMU_TYPE_INFRA | + CFG_IFA_MASTER_IN_ATF | SHARE_PGTABLE | PGTABLE_PA_35_EN, + .hw_list = &infralist, + .banks_num = 1, + .banks_enable = {true}, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .iova_region = single_domain, + .iova_region_nr = ARRAY_SIZE(single_domain), +}; + +static const u32 mt8189_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = { + [0] = {~0, ~0, ~0, [22] = BIT(0)}, /* Region0: all ports for larb0/1/2 */ + [1] = {[3] = ~0, [4] = ~0}, /* Region1: all ports for larb4(3)/7(4) */ + [2] = {[5] = ~0, [6] = ~0, /* Region2: all ports for larb9(5)/11(6) */ + [7] = ~0, [8] = ~0, /* Region2: all ports for larb13(7)/14(8) */ + [9] = ~0, [10] = ~0, /* Region2: all ports for larb16(9)/17(10) */ + [11] = ~0, [12] = ~0, /* Region2: all ports for larb19(11)/20(12) */ + [21] = ~0}, /* Region2: larb21 fake GCE larb */ +}; + +static const struct mtk_iommu_plat_data mt8189_data_mm = { + .m4u_plat = M4U_MT8189, + .flags = HAS_BCLK | HAS_SUB_COMM_3BITS | OUT_ORDER_WR_EN | + WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM | + PGTABLE_PA_35_EN | DL_WITH_MULTI_LARB, + .hw_list = &m4ulist, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .banks_num = 5, + .banks_enable = {true, false, false, false, false}, + .iova_region = mt8192_multi_dom, + .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom), + .iova_region_larb_msk = mt8189_larb_region_msk, + .larbid_remap = {{0}, {1}, {21/* GCE_D */, 21/* GCE_M */, 2}, + {19, 20, 9, 11}, {7}, {4}, + {13, 17}, {14, 16}}, +}; + static const struct mtk_iommu_plat_data mt8192_data = { .m4u_plat = M4U_MT8192, .flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN | @@ -1796,6 +1911,9 @@ static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt8188-iommu-infra", .data = &mt8188_data_infra}, { .compatible = 
"mediatek,mt8188-iommu-vdo", .data = &mt8188_data_vdo}, { .compatible = "mediatek,mt8188-iommu-vpp", .data = &mt8188_data_vpp}, + { .compatible = "mediatek,mt8189-iommu-apu", .data = &mt8189_data_apu}, + { .compatible = "mediatek,mt8189-iommu-infra", .data = &mt8189_data_infra}, + { .compatible = "mediatek,mt8189-iommu-mm", .data = &mt8189_data_mm}, { .compatible = "mediatek,mt8192-m4u", .data = &mt8192_data}, { .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra}, { .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo}, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 10cc0b1197e8..c8d8eff5373d 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -303,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain) kfree(to_mtk_domain(domain)); } -static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev) +static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain); @@ -329,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device } static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); @@ -435,6 +438,8 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, return -EINVAL; dev_iommu_priv_set(dev, platform_get_drvdata(m4updev)); + + put_device(&m4updev->dev); } ret = iommu_fwspec_add_ids(dev, args->args, 1); @@ -641,13 +646,18 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) if (larb_nr < 0) return larb_nr; + if (larb_nr > MTK_LARB_NR_MAX) + return -EINVAL; + for (i = 0; i < larb_nr; i++) { struct device_node *larbnode; struct platform_device *plarbdev; larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i); - if (!larbnode) - return -EINVAL; + if (!larbnode) { + ret = -EINVAL; + goto out_put_larbs; + } if (!of_device_is_available(larbnode)) { of_node_put(larbnode); @@ -657,11 +667,14 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) plarbdev = of_find_device_by_node(larbnode); if (!plarbdev) { of_node_put(larbnode); - return -ENODEV; + ret = -ENODEV; + goto out_put_larbs; } if (!plarbdev->dev.driver) { of_node_put(larbnode); - return -EPROBE_DEFER; + put_device(&plarbdev->dev); + ret = -EPROBE_DEFER; + goto out_put_larbs; } data->larb_imu[i].dev = &plarbdev->dev; @@ -673,7 +686,7 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) ret = mtk_iommu_v1_hw_init(data); if (ret) - return ret; + goto out_put_larbs; ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL, dev_name(&pdev->dev)); @@ -695,12 +708,17 @@ out_sysfs_remove: iommu_device_sysfs_remove(&data->iommu); out_clk_unprepare: clk_disable_unprepare(data->bclk); +out_put_larbs: + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); + return ret; } static void mtk_iommu_v1_remove(struct platform_device *pdev) { struct mtk_iommu_v1_data *data = platform_get_drvdata(pdev); + int i; iommu_device_sysfs_remove(&data->iommu); iommu_device_unregister(&data->iommu); @@ -708,6 +726,9 @@ static void mtk_iommu_v1_remove(struct platform_device *pdev) clk_disable_unprepare(data->bclk); devm_free_irq(&pdev->dev, data->irq, data); component_master_del(&pdev->dev, 
&mtk_iommu_v1_com_ops); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); } static int __maybe_unused mtk_iommu_v1_suspend(struct device *dev) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 5c6f5943f44b..768973b7e511 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1431,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain) odomain->iommus = NULL; } -static int -omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int omap_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev); struct omap_iommu_domain *omap_domain = to_omap_domain(domain); @@ -1536,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, } static int omap_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct omap_iommu_domain *omap_domain; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - omap_domain = to_omap_domain(domain); + omap_domain = to_omap_domain(old); spin_lock(&omap_domain->lock); _omap_iommu_detach_dev(omap_domain, dev); spin_unlock(&omap_domain->lock); @@ -1668,23 +1668,20 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev) } pdev = of_find_device_by_node(np); + of_node_put(np); if (!pdev) { - of_node_put(np); kfree(arch_data); return ERR_PTR(-ENODEV); } oiommu = platform_get_drvdata(pdev); + put_device(&pdev->dev); if (!oiommu) { - of_node_put(np); kfree(arch_data); return ERR_PTR(-EINVAL); } tmp->iommu_dev = oiommu; - tmp->dev = &pdev->dev; - - of_node_put(np); } dev_iommu_priv_set(dev, arch_data); diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h index 27697109ec79..50b39be61abc 100644 --- a/drivers/iommu/omap-iommu.h +++ b/drivers/iommu/omap-iommu.h @@ -88,7 +88,6 @@ struct omap_iommu { /** * struct omap_iommu_arch_data - omap iommu private data * @iommu_dev: handle of the OMAP iommu device - * @dev: handle of the iommu device * * This is an omap iommu private data object, which binds an iommu user * to its iommu device. 
This object should be placed at the iommu user's @@ -97,7 +96,6 @@ struct omap_iommu { */ struct omap_iommu_arch_data { struct omap_iommu *iommu_dev; - struct device *dev; }; struct cr_regs { diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index ebb22979075d..d9429097a2b5 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -1321,7 +1321,8 @@ static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_m } static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); struct riscv_iommu_device *iommu = dev_to_iommu(dev); @@ -1426,7 +1427,8 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) } static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); @@ -1447,7 +1449,8 @@ static struct iommu_domain riscv_iommu_blocking_domain = { }; static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 0861dd469bd8..85f3667e797c 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -960,7 +960,8 @@ out_disable_clocks: } static int rk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain; @@ -1005,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = { }; static int rk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain = to_rk_domain(domain); @@ -1026,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - ret = rk_iommu_identity_attach(&rk_identity_domain, dev); + ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old); if (ret) return ret; @@ -1041,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, return 0; ret = rk_iommu_enable(iommu); - if (ret) - WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev)); + if (ret) { + /* + * Note rk_iommu_identity_attach() might fail before physically + * attaching the dev to iommu->domain, in which case the actual + * old domain for this revert should be rk_identity_domain v.s. + * iommu->domain. Since rk_iommu_identity_attach() does not care + * about the old domain argument for now, this is not a problem. 
+ */ + WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev, + iommu->domain)); + } pm_runtime_put(iommu->dev); diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index aa576736d60b..fe679850af28 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -670,7 +670,8 @@ int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status) } static int blocking_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct zpci_dev *zdev = to_zpci_dev(dev); struct s390_domain *s390_domain; @@ -694,7 +695,8 @@ static int blocking_domain_attach_device(struct iommu_domain *domain, } static int s390_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); @@ -709,7 +711,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, domain->geometry.aperture_end < zdev->start_dma)) return -EINVAL; - blocking_domain_attach_device(&blocking_domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); @@ -1131,13 +1133,14 @@ static int __init s390_iommu_init(void) subsys_initcall(s390_iommu_init); static int s390_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct zpci_dev *zdev = to_zpci_dev(dev); u8 status; int cc; - blocking_domain_attach_device(&blocking_domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index c7ca1d8a0b15..555d4505c747 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -247,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain) } static int sprd_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); struct sprd_iommu_domain *dom = to_sprd_domain(domain); diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index de10b569d9a9..90b26fe21817 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -771,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu, } static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); struct sun50i_iommu_domain *sun50i_domain; @@ -797,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = { }; static int sun50i_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu; @@ -813,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev); + sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old); sun50i_iommu_attach_domain(iommu, sun50i_domain); @@ -839,6 +841,8 @@ static int 
sun50i_iommu_of_xlate(struct device *dev, dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev)); + put_device(&iommu_pdev->dev); + return iommu_fwspec_add_ids(dev, &id, 1); } diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 36cdd5fbab07..c391e7f2cde6 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -490,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu, } static int tegra_smmu_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu *smmu = dev_iommu_priv_get(dev); @@ -524,9 +524,9 @@ disable: } static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu_as *as; struct tegra_smmu *smmu; @@ -535,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, if (!fwspec) return -ENODEV; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - as = to_smmu_as(domain); + as = to_smmu_as(old); smmu = as->smmu; for (index = 0; index < fwspec->num_ids; index++) { tegra_smmu_disable(smmu, fwspec->ids[index], as->id); @@ -830,10 +830,9 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np) return NULL; mc = platform_get_drvdata(pdev); - if (!mc) { - put_device(&pdev->dev); + put_device(&pdev->dev); + if (!mc) return NULL; - } return mc->smmu; } diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index b39d6f134ab2..d314fa5cd847 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -730,7 +730,8 @@ static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev) return domain; } -static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; struct virtio_iommu_req_attach req; @@ -781,7 +782,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int viommu_attach_identity_domain(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret = 0; struct virtio_iommu_req_attach req; diff --git a/drivers/net/ethernet/broadcom/bnge/Makefile b/drivers/net/ethernet/broadcom/bnge/Makefile index 6142d9c57f49..ea6596854e5c 100644 --- a/drivers/net/ethernet/broadcom/bnge/Makefile +++ b/drivers/net/ethernet/broadcom/bnge/Makefile @@ -9,4 +9,5 @@ bng_en-y := bnge_core.o \ bnge_rmem.o \ bnge_resc.o \ bnge_netdev.o \ - bnge_ethtool.o + bnge_ethtool.o \ + bnge_auxr.o diff --git a/drivers/net/ethernet/broadcom/bnge/bnge.h b/drivers/net/ethernet/broadcom/bnge/bnge.h index 7aed5f81cd51..411744894349 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge.h +++ b/drivers/net/ethernet/broadcom/bnge/bnge.h @@ -11,6 +11,7 @@ #include <linux/bnxt/hsi.h> #include "bnge_rmem.h" #include "bnge_resc.h" +#include "bnge_auxr.h" #define DRV_VER_MAJ 1 #define DRV_VER_MIN 15 @@ -22,6 +23,12 @@ enum board_idx { BCM57708, }; +struct bnge_auxr_priv { + struct auxiliary_device aux_dev; + struct bnge_auxr_dev *auxr_dev; + int id; +}; + struct bnge_pf_info { u16 fw_fid; u16 port_id; @@ -197,6 +204,9 @@ struct bnge_dev { struct bnge_irq *irq_tbl; u16 
irqs_acquired; + + struct bnge_auxr_priv *aux_priv; + struct bnge_auxr_dev *auxr_dev; }; static inline bool bnge_is_roce_en(struct bnge_dev *bd) diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c new file mode 100644 index 000000000000..d64592b64e17 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. + +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/bitops.h> +#include <linux/irq.h> +#include <asm/byteorder.h> +#include <linux/bitmap.h> +#include <linux/auxiliary_bus.h> +#include <linux/bnxt/hsi.h> + +#include "bnge.h" +#include "bnge_hwrm.h" +#include "bnge_auxr.h" + +static DEFINE_IDA(bnge_aux_dev_ids); + +static void bnge_fill_msix_vecs(struct bnge_dev *bd, + struct bnge_msix_info *info) +{ + struct bnge_auxr_dev *auxr_dev = bd->auxr_dev; + int num_msix, i; + + if (!auxr_dev->auxr_info->msix_requested) { + dev_warn(bd->dev, "Requested MSI-X vectors not allocated\n"); + return; + } + num_msix = auxr_dev->auxr_info->msix_requested; + for (i = 0; i < num_msix; i++) { + info[i].vector = bd->irq_tbl[i].vector; + info[i].db_offset = bd->db_offset; + info[i].ring_idx = i; + } +} + +int bnge_register_dev(struct bnge_auxr_dev *auxr_dev, + void *handle) +{ + struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev); + struct bnge_auxr_info *auxr_info; + int rc = 0; + + netdev_lock(bd->netdev); + mutex_lock(&auxr_dev->auxr_dev_lock); + if (!bd->irq_tbl) { + rc = -ENODEV; + goto exit; + } + + if (!bnge_aux_has_enough_resources(bd)) { + rc = -ENOMEM; + goto exit; + } + + auxr_info = auxr_dev->auxr_info; + auxr_info->handle = handle; + + auxr_info->msix_requested = bd->aux_num_msix; + + bnge_fill_msix_vecs(bd, bd->auxr_dev->msix_info); + auxr_dev->flags |= BNGE_ARDEV_MSIX_ALLOC; + +exit: + mutex_unlock(&auxr_dev->auxr_dev_lock); + netdev_unlock(bd->netdev); + return rc; +} +EXPORT_SYMBOL(bnge_register_dev); + +void bnge_unregister_dev(struct bnge_auxr_dev *auxr_dev) +{ + struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev); + struct bnge_auxr_info *auxr_info; + + auxr_info = auxr_dev->auxr_info; + netdev_lock(bd->netdev); + mutex_lock(&auxr_dev->auxr_dev_lock); + if (auxr_info->msix_requested) + auxr_dev->flags &= ~BNGE_ARDEV_MSIX_ALLOC; + auxr_info->msix_requested = 0; + + mutex_unlock(&auxr_dev->auxr_dev_lock); + netdev_unlock(bd->netdev); +} +EXPORT_SYMBOL(bnge_unregister_dev); + +int bnge_send_msg(struct bnge_auxr_dev *auxr_dev, struct bnge_fw_msg *fw_msg) +{ + struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev); + struct output *resp; + struct input *req; + u32 resp_len; + int rc; + + rc = bnge_hwrm_req_init(bd, req, 0 /* don't care */); + if (rc) + return rc; + + rc = bnge_hwrm_req_replace(bd, req, fw_msg->msg, fw_msg->msg_len); + if (rc) + goto drop_req; + + bnge_hwrm_req_timeout(bd, req, fw_msg->timeout); + resp = bnge_hwrm_req_hold(bd, req); + rc = bnge_hwrm_req_send(bd, req); + resp_len = le16_to_cpu(resp->resp_len); + if (resp_len) { + if (fw_msg->resp_max_len < resp_len) + resp_len = fw_msg->resp_max_len; + + memcpy(fw_msg->resp, resp, resp_len); + } +drop_req: + bnge_hwrm_req_drop(bd, req); + return rc; +} +EXPORT_SYMBOL(bnge_send_msg); + +void bnge_rdma_aux_device_uninit(struct bnge_dev *bd) +{ + struct bnge_auxr_priv *aux_priv; + struct auxiliary_device *adev; + + /* 
Skip if no auxiliary device init was done. */ + if (!bd->aux_priv) + return; + + aux_priv = bd->aux_priv; + adev = &aux_priv->aux_dev; + auxiliary_device_uninit(adev); +} + +static void bnge_aux_dev_release(struct device *dev) +{ + struct bnge_auxr_priv *aux_priv = + container_of(dev, struct bnge_auxr_priv, aux_dev.dev); + struct bnge_dev *bd = pci_get_drvdata(aux_priv->auxr_dev->pdev); + + ida_free(&bnge_aux_dev_ids, aux_priv->id); + kfree(aux_priv->auxr_dev->auxr_info); + bd->auxr_dev = NULL; + kfree(aux_priv->auxr_dev); + kfree(aux_priv); + bd->aux_priv = NULL; +} + +void bnge_rdma_aux_device_del(struct bnge_dev *bd) +{ + if (!bd->auxr_dev) + return; + + auxiliary_device_delete(&bd->aux_priv->aux_dev); +} + +static void bnge_set_auxr_dev_info(struct bnge_auxr_dev *auxr_dev, + struct bnge_dev *bd) +{ + auxr_dev->pdev = bd->pdev; + auxr_dev->l2_db_size = bd->db_size; + auxr_dev->l2_db_size_nc = bd->db_size; + auxr_dev->l2_db_offset = bd->db_offset; + mutex_init(&auxr_dev->auxr_dev_lock); + + if (bd->flags & BNGE_EN_ROCE_V1) + auxr_dev->flags |= BNGE_ARDEV_ROCEV1_SUPP; + if (bd->flags & BNGE_EN_ROCE_V2) + auxr_dev->flags |= BNGE_ARDEV_ROCEV2_SUPP; + + auxr_dev->chip_num = bd->chip_num; + auxr_dev->hw_ring_stats_size = bd->hw_ring_stats_size; + auxr_dev->pf_port_id = bd->pf.port_id; + auxr_dev->en_state = bd->state; + auxr_dev->bar0 = bd->bar0; +} + +void bnge_rdma_aux_device_add(struct bnge_dev *bd) +{ + struct auxiliary_device *aux_dev; + int rc; + + if (!bd->auxr_dev) + return; + + aux_dev = &bd->aux_priv->aux_dev; + rc = auxiliary_device_add(aux_dev); + if (rc) { + dev_warn(bd->dev, "Failed to add auxiliary device for ROCE\n"); + auxiliary_device_uninit(aux_dev); + bd->flags &= ~BNGE_EN_ROCE; + } + + bd->auxr_dev->net = bd->netdev; +} + +void bnge_rdma_aux_device_init(struct bnge_dev *bd) +{ + struct auxiliary_device *aux_dev; + struct bnge_auxr_info *auxr_info; + struct bnge_auxr_priv *aux_priv; + struct bnge_auxr_dev *auxr_dev; + int rc; + + if (!bnge_is_roce_en(bd)) + return; + + aux_priv = kzalloc(sizeof(*aux_priv), GFP_KERNEL); + if (!aux_priv) + goto exit; + + aux_priv->id = ida_alloc(&bnge_aux_dev_ids, GFP_KERNEL); + if (aux_priv->id < 0) { + dev_warn(bd->dev, "ida alloc failed for aux device\n"); + kfree(aux_priv); + goto exit; + } + + aux_dev = &aux_priv->aux_dev; + aux_dev->id = aux_priv->id; + aux_dev->name = "rdma"; + aux_dev->dev.parent = &bd->pdev->dev; + aux_dev->dev.release = bnge_aux_dev_release; + + rc = auxiliary_device_init(aux_dev); + if (rc) { + ida_free(&bnge_aux_dev_ids, aux_priv->id); + kfree(aux_priv); + goto exit; + } + bd->aux_priv = aux_priv; + + auxr_dev = kzalloc(sizeof(*auxr_dev), GFP_KERNEL); + if (!auxr_dev) + goto aux_dev_uninit; + + aux_priv->auxr_dev = auxr_dev; + + auxr_info = kzalloc(sizeof(*auxr_info), GFP_KERNEL); + if (!auxr_info) + goto aux_dev_uninit; + + auxr_dev->auxr_info = auxr_info; + bd->auxr_dev = auxr_dev; + bnge_set_auxr_dev_info(auxr_dev, bd); + + return; + +aux_dev_uninit: + auxiliary_device_uninit(aux_dev); +exit: + bd->flags &= ~BNGE_EN_ROCE; +} diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h new file mode 100644 index 000000000000..6c5c15ef2b0a --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Broadcom */ + +#ifndef _BNGE_AUXR_H_ +#define _BNGE_AUXR_H_ + +#include <linux/auxiliary_bus.h> + +#define BNGE_MIN_ROCE_CP_RINGS 2 +#define BNGE_MIN_ROCE_STAT_CTXS 1 + +#define 
BNGE_MAX_ROCE_MSIX 64 + +struct hwrm_async_event_cmpl; +struct bnge; + +struct bnge_msix_info { + u32 vector; + u32 ring_idx; + u32 db_offset; +}; + +struct bnge_fw_msg { + void *msg; + int msg_len; + void *resp; + int resp_max_len; + int timeout; +}; + +struct bnge_auxr_info { + void *handle; + u16 msix_requested; +}; + +enum { + BNGE_ARDEV_ROCEV1_SUPP = BIT(0), + BNGE_ARDEV_ROCEV2_SUPP = BIT(1), + BNGE_ARDEV_MSIX_ALLOC = BIT(2), +}; + +#define BNGE_ARDEV_ROCE_SUPP (BNGE_ARDEV_ROCEV1_SUPP | \ + BNGE_ARDEV_ROCEV2_SUPP) + +struct bnge_auxr_dev { + struct net_device *net; + struct pci_dev *pdev; + void __iomem *bar0; + + struct bnge_msix_info msix_info[BNGE_MAX_ROCE_MSIX]; + + u32 flags; + + struct bnge_auxr_info *auxr_info; + + /* Doorbell BAR size in bytes mapped by L2 driver. */ + int l2_db_size; + /* Doorbell BAR size in bytes mapped as non-cacheable. */ + int l2_db_size_nc; + /* Doorbell offset in bytes within l2_db_size_nc. */ + int l2_db_offset; + + u16 chip_num; + u16 hw_ring_stats_size; + u16 pf_port_id; + unsigned long en_state; + + u16 auxr_num_msix_vec; + u16 auxr_num_ctxs; + + /* serialize auxr operations */ + struct mutex auxr_dev_lock; +}; + +void bnge_rdma_aux_device_uninit(struct bnge_dev *bdev); +void bnge_rdma_aux_device_del(struct bnge_dev *bdev); +void bnge_rdma_aux_device_add(struct bnge_dev *bdev); +void bnge_rdma_aux_device_init(struct bnge_dev *bdev); +int bnge_register_dev(struct bnge_auxr_dev *adev, + void *handle); +void bnge_unregister_dev(struct bnge_auxr_dev *adev); +int bnge_send_msg(struct bnge_auxr_dev *adev, struct bnge_fw_msg *fw_msg); + +#endif /* _BNGE_AUXR_H_ */ diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_core.c b/drivers/net/ethernet/broadcom/bnge/bnge_core.c index 2c72dd34d50d..c94e132bebc8 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_core.c +++ b/drivers/net/ethernet/broadcom/bnge/bnge_core.c @@ -41,6 +41,11 @@ static void bnge_print_device_info(struct pci_dev *pdev, enum board_idx idx) bool bnge_aux_registered(struct bnge_dev *bd) { + struct bnge_auxr_dev *ba_dev = bd->auxr_dev; + + if (ba_dev && ba_dev->auxr_info->msix_requested) + return true; + return false; } @@ -312,16 +317,20 @@ static int bnge_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent) spin_lock_init(&bd->db_lock); #endif + bnge_rdma_aux_device_init(bd); + rc = bnge_alloc_irqs(bd); if (rc) { dev_err(&pdev->dev, "Error IRQ allocation rc = %d\n", rc); - goto err_config_uninit; + goto err_uninit_auxr; } rc = bnge_netdev_alloc(bd, max_irqs); if (rc) goto err_free_irq; + bnge_rdma_aux_device_add(bd); + pci_save_state(pdev); return 0; @@ -329,6 +338,9 @@ static int bnge_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent) err_free_irq: bnge_free_irqs(bd); +err_uninit_auxr: + bnge_rdma_aux_device_uninit(bd); + err_config_uninit: bnge_net_uninit_dflt_config(bd); @@ -354,10 +366,14 @@ static void bnge_remove_one(struct pci_dev *pdev) { struct bnge_dev *bd = pci_get_drvdata(pdev); + bnge_rdma_aux_device_del(bd); + bnge_netdev_free(bd); bnge_free_irqs(bd); + bnge_rdma_aux_device_uninit(bd); + bnge_net_uninit_dflt_config(bd); bnge_devlink_unregister(bd); diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c index 0f971af24142..c3087e5cd875 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c +++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c @@ -98,6 +98,46 @@ void bnge_hwrm_req_alloc_flags(struct bnge_dev *bd, void *req, gfp_t gfp) ctx->gfp = gfp; } +int bnge_hwrm_req_replace(struct 
bnge_dev *bd, void *req, void *new_req, + u32 len) +{ + struct bnge_hwrm_ctx *ctx = __hwrm_ctx_get(bd, req); + struct input *internal_req = req; + u16 req_type; + + if (!ctx) + return -EINVAL; + + if (len > BNGE_HWRM_CTX_OFFSET) + return -E2BIG; + + /* free any existing slices */ + ctx->allocated = BNGE_HWRM_DMA_SIZE - BNGE_HWRM_CTX_OFFSET; + if (ctx->slice_addr) { + dma_free_coherent(bd->dev, ctx->slice_size, + ctx->slice_addr, ctx->slice_handle); + ctx->slice_addr = NULL; + } + ctx->gfp = GFP_KERNEL; + + if ((bd->fw_cap & BNGE_FW_CAP_SHORT_CMD) || len > BNGE_HWRM_MAX_REQ_LEN) { + memcpy(internal_req, new_req, len); + } else { + internal_req->req_type = ((struct input *)new_req)->req_type; + ctx->req = new_req; + } + + ctx->req_len = len; + ctx->req->resp_addr = cpu_to_le64(ctx->dma_handle + + BNGE_HWRM_RESP_OFFSET); + + /* update sentinel for potentially new request type */ + req_type = le16_to_cpu(internal_req->req_type); + ctx->sentinel = bnge_cal_sentinel(ctx, req_type); + + return 0; +} + void bnge_hwrm_req_flags(struct bnge_dev *bd, void *req, enum bnge_hwrm_ctx_flags flags) { diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h index 83794a12cc81..6df629761d95 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h +++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h @@ -107,4 +107,6 @@ int bnge_hwrm_req_send_silent(struct bnge_dev *bd, void *req); void bnge_hwrm_req_alloc_flags(struct bnge_dev *bd, void *req, gfp_t flags); void *bnge_hwrm_req_dma_slice(struct bnge_dev *bd, void *req, u32 size, dma_addr_t *dma); +int bnge_hwrm_req_replace(struct bnge_dev *bd, void *req, void *new_req, + u32 len); #endif /* _BNGE_HWRM_H_ */ diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_resc.c b/drivers/net/ethernet/broadcom/bnge/bnge_resc.c index 62ebe03a0dcf..943df5f60f01 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_resc.c +++ b/drivers/net/ethernet/broadcom/bnge/bnge_resc.c @@ -34,6 +34,18 @@ static unsigned int bnge_get_max_func_stat_ctxs(struct bnge_dev *bd) return bd->hw_resc.max_stat_ctxs; } +bool bnge_aux_has_enough_resources(struct bnge_dev *bd) +{ + unsigned int max_stat_ctxs; + + max_stat_ctxs = bnge_get_max_func_stat_ctxs(bd); + if (max_stat_ctxs <= BNGE_MIN_ROCE_STAT_CTXS || + bd->nq_nr_rings == max_stat_ctxs) + return false; + + return true; +} + static unsigned int bnge_get_max_func_cp_rings(struct bnge_dev *bd) { return bd->hw_resc.max_cp_rings; diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_resc.h b/drivers/net/ethernet/broadcom/bnge/bnge_resc.h index 0d6213b27580..b62a634669f6 100644 --- a/drivers/net/ethernet/broadcom/bnge/bnge_resc.h +++ b/drivers/net/ethernet/broadcom/bnge/bnge_resc.h @@ -74,6 +74,7 @@ void bnge_net_uninit_dflt_config(struct bnge_dev *bd); void bnge_aux_init_dflt_config(struct bnge_dev *bd); u32 bnge_get_rxfh_indir_size(struct bnge_dev *bd); int bnge_cal_nr_rss_ctxs(u16 rx_rings); +bool bnge_aux_has_enough_resources(struct bnge_dev *bd); static inline u32 bnge_adjust_pow_two(u32 total_ent, u16 ent_per_blk) diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index cb1011f6fd30..805daae9dd36 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -6444,7 +6444,6 @@ bnx2_reset_task(struct work_struct *work) if (!(pcicmd & PCI_COMMAND_MEMORY)) { /* in case PCI block has reset */ pci_restore_state(bp->pdev); - pci_save_state(bp->pdev); } rc = bnx2_init_nic(bp, 1); if (rc) { @@ -8718,7 +8717,6 @@ static 
pci_ers_result_t bnx2_io_slot_reset(struct pci_dev *pdev) } else { pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); if (netif_running(dev)) err = bnx2_init_nic(bp, 1); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index aca4267babc8..6a1cc2032bf3 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -14216,7 +14216,6 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); if (netif_running(dev)) bnx2x_set_power_state(bp, PCI_D0); diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index e21f7c6a6de7..75f66587983d 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -18337,7 +18337,6 @@ static pci_ers_result_t tg3_io_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); if (!netdev || !netif_running(netdev)) { rc = PCI_ERS_RESULT_RECOVERED; diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index f92a3550e480..3b1321c8ed14 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2933,7 +2933,6 @@ static int t3_reenable_adapter(struct adapter *adapter) } pci_set_master(adapter->pdev); pci_restore_state(adapter->pdev); - pci_save_state(adapter->pdev); /* Free sge resources */ t3_free_sge_resources(adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 66b8854e059f..043733c5c812 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -5458,7 +5458,6 @@ static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev) if (!adap) { pci_restore_state(pdev); - pci_save_state(pdev); return PCI_ERS_RESULT_RECOVERED; } @@ -5473,7 +5472,6 @@ static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); if (t4_wait_dev_ready(adap->regs) < 0) return PCI_ERS_RESULT_DISCONNECT; diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c index e11495b7ee98..7234618e8e81 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c @@ -160,7 +160,6 @@ static pci_ers_result_t hbg_pci_err_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); hbg_err_reset(priv); return PCI_ERS_RESULT_RECOVERED; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 116f3c92b5bc..ddbe2f7d8112 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -7195,7 +7195,6 @@ static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev) "Cannot re-enable PCI device after reset.\n"); result = PCI_ERS_RESULT_DISCONNECT; } else { - pdev->state_saved = true; pci_restore_state(pdev); pci_set_master(pdev); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index ae5fe34659cf..d75b8a50413d 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -2423,12 +2423,6 @@ static pci_ers_result_t 
fm10k_io_slot_reset(struct pci_dev *pdev) } else { pci_set_master(pdev); pci_restore_state(pdev); - - /* After second error pci->state_saved is false, this - * resets it so EEH doesn't break. - */ - pci_save_state(pdev); - pci_wake_from_d3(pdev, false); result = PCI_ERS_RESULT_RECOVERED; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 50be0a60ae13..d8192aa23254 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16455,7 +16455,6 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) } else { pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); pci_wake_from_d3(pdev, false); reg = rd32(&pf->hw, I40E_GLGEN_RTRIG); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 2533876f1a2f..4bb68e7a00f5 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5653,7 +5653,6 @@ static int ice_resume(struct device *dev) pci_set_power_state(pdev, PCI_D0); pci_restore_state(pdev); - pci_save_state(pdev); if (!pci_device_is_present(pdev)) return -ENODEV; @@ -5753,7 +5752,6 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev) } else { pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); pci_wake_from_d3(pdev, false); /* Check for life */ diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 85f9589cc568..dbea37269d2c 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -9599,7 +9599,6 @@ static int __igb_resume(struct device *dev, bool rpm) pci_set_power_state(pdev, PCI_D0); pci_restore_state(pdev); - pci_save_state(pdev); if (!pci_device_is_present(pdev)) return -ENODEV; @@ -9754,7 +9753,6 @@ static pci_ers_result_t igb_io_slot_reset(struct pci_dev *pdev) } else { pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); pci_enable_wake(pdev, PCI_D3hot, 0); pci_enable_wake(pdev, PCI_D3cold, 0); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 728d7ca5338b..7aafa60ba0c8 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7530,7 +7530,6 @@ static int __igc_resume(struct device *dev, bool rpm) pci_set_power_state(pdev, PCI_D0); pci_restore_state(pdev); - pci_save_state(pdev); if (!pci_device_is_present(pdev)) return -ENODEV; @@ -7667,7 +7666,6 @@ static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev) } else { pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); pci_enable_wake(pdev, PCI_D3hot, 0); pci_enable_wake(pdev, PCI_D3cold, 0); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4af3b3e71ff1..034618e79169 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -12298,7 +12298,6 @@ static pci_ers_result_t ixgbe_io_slot_reset(struct pci_dev *pdev) adapter->hw.hw_addr = adapter->io_addr; pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); pci_wake_from_d3(pdev, false); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 2de226951e19..4293f8e33f44 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -4368,7 +4368,6 @@ static 
pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); return PCI_ERS_RESULT_RECOVERED; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 024339ce41f1..1ab569ce3fcf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -2137,7 +2137,6 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); err = wait_vital(pdev); if (err) { diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 861d98099c44..9240673c7533 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -581,7 +581,6 @@ static pci_ers_result_t fbnic_err_slot_reset(struct pci_dev *pdev) pci_set_power_state(pdev, PCI_D0); pci_restore_state(pdev); - pci_save_state(pdev); if (pci_enable_device_mem(pdev)) { dev_err(&pdev->dev, diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 9d70b51ca91d..e4c542fc6c2b 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -3915,7 +3915,6 @@ static int lan743x_pm_resume(struct device *dev) pci_set_power_state(pdev, PCI_D0); pci_restore_state(pdev); - pci_save_state(pdev); /* Restore HW_CFG that was saved during pm suspend */ if (adapter->is_pci11x1x) diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index e611ff7fa3fa..7be30a8df268 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -3416,10 +3416,6 @@ static void myri10ge_watchdog(struct work_struct *work) * nic was resumed from power saving mode. */ pci_restore_state(mgp->pdev); - - /* save state again for accounting reasons */ - pci_save_state(mgp->pdev); - } else { /* if we get back -1's from our slot, perhaps somebody * powered off our card. Don't try to reset it in diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 5026b0263d43..1e55ccb4822b 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -3425,7 +3425,6 @@ static void s2io_reset(struct s2io_nic *sp) /* Restore the PCI state saved during initialization. 
*/ pci_restore_state(sp->pdev); - pci_save_state(sp->pdev); pci_read_config_word(sp->pdev, 0x2, &val16); if (check_pci_device_id(val16) != (u16)PCI_ANY_ID) break; diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 67647f1880fb..f3c81c892786 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_PCI) += access.o bus.o probe.o host-bridge.o \ remove.o pci.o pci-driver.o search.o \ - rom.o setup-res.o irq.o vpd.o \ + rebar.o rom.o setup-res.o irq.o vpd.o \ setup-bus.o vc.o mmap.o devres.o obj-$(CONFIG_PCI) += msi/ diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index f26aec6ff588..9daf13ed3714 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -357,6 +357,9 @@ void pci_bus_add_device(struct pci_dev *dev) pci_proc_attach_device(dev); pci_bridge_d3_update(dev); + /* Save config space for error recoverability */ + pci_save_state(dev); + /* * If the PCI device is associated with a pwrctrl device with a * power supply, create a device link between the PCI device and diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 41748d083b93..c254d2b8bf17 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -146,7 +146,7 @@ config PCIE_HISI_ERR config PCI_IXP4XX bool "Intel IXP4xx PCI controller" - depends on ARM && OF + depends on OF depends on ARCH_IXP4XX || COMPILE_TEST default ARCH_IXP4XX help @@ -259,12 +259,20 @@ config PCIE_RCAR_EP config PCI_RCAR_GEN2 bool "Renesas R-Car Gen2 Internal PCI controller" - depends on ARCH_RENESAS || COMPILE_TEST - depends on ARM + depends on (ARCH_RENESAS && ARM) || COMPILE_TEST help Say Y here if you want internal PCI support on R-Car Gen2 SoC. - There are 3 internal PCI controllers available with a single - built-in EHCI/OHCI host controller present on each one. + Each internal PCI controller contains a single built-in EHCI/OHCI + host controller. + +config PCIE_RENESAS_RZG3S_HOST + bool "Renesas RZ/G3S PCIe host controller" + depends on ARCH_RENESAS || COMPILE_TEST + select MFD_SYSCON + select IRQ_MSI_LIB + help + Say Y here if you want PCIe host controller support on Renesas RZ/G3S + SoC. config PCIE_ROCKCHIP bool diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index 038ccbd9e3ba..229929a945c2 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar.o pcie-rcar-host.o obj-$(CONFIG_PCIE_RCAR_EP) += pcie-rcar.o pcie-rcar-ep.o +obj-$(CONFIG_PCIE_RENESAS_RZG3S_HOST) += pcie-rzg3s-host.o obj-$(CONFIG_PCI_HOST_COMMON) += pci-host-common.o obj-$(CONFIG_PCI_HOST_GENERIC) += pci-host-generic.o obj-$(CONFIG_PCI_HOST_THUNDER_ECAM) += pci-thunder-ecam.o diff --git a/drivers/pci/controller/cadence/Kconfig b/drivers/pci/controller/cadence/Kconfig index 02a639e55fd8..9e651d545973 100644 --- a/drivers/pci/controller/cadence/Kconfig +++ b/drivers/pci/controller/cadence/Kconfig @@ -19,10 +19,10 @@ config PCIE_CADENCE_EP select PCIE_CADENCE config PCIE_CADENCE_PLAT - bool + tristate config PCIE_CADENCE_PLAT_HOST - bool "Cadence platform PCIe controller (host mode)" + tristate "Cadence platform PCIe controller (host mode)" depends on OF select PCIE_CADENCE_HOST select PCIE_CADENCE_PLAT @@ -32,7 +32,7 @@ config PCIE_CADENCE_PLAT_HOST vendors SoCs. 
config PCIE_CADENCE_PLAT_EP - bool "Cadence platform PCIe controller (endpoint mode)" + tristate "Cadence platform PCIe controller (endpoint mode)" depends on OF depends on PCI_ENDPOINT select PCIE_CADENCE_EP @@ -42,6 +42,21 @@ config PCIE_CADENCE_PLAT_EP endpoint mode. This PCIe controller may be embedded into many different vendors SoCs. +config PCI_SKY1_HOST + tristate "CIX SKY1 PCIe controller (host mode)" + depends on OF && (ARCH_CIX || COMPILE_TEST) + select PCIE_CADENCE_HOST + select PCI_ECAM + help + Say Y here if you want to support the CIX SKY1 PCIe platform + controller in host mode. CIX SKY1 PCIe controller uses Cadence + HPA (High Performance Architecture IP [Second generation of + Cadence PCIe IP]) + + This driver requires Cadence PCIe core infrastructure + (PCIE_CADENCE_HOST) and hardware platform adaptation layer + to function. + config PCIE_SG2042_HOST tristate "Sophgo SG2042 PCIe controller (host mode)" depends on OF && (ARCH_SOPHGO || COMPILE_TEST) diff --git a/drivers/pci/controller/cadence/Makefile b/drivers/pci/controller/cadence/Makefile index 5e23f8539ecc..b8ec1cecfaa8 100644 --- a/drivers/pci/controller/cadence/Makefile +++ b/drivers/pci/controller/cadence/Makefile @@ -1,7 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_PCIE_CADENCE) += pcie-cadence.o -obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o -obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o +pcie-cadence-mod-y := pcie-cadence-hpa.o pcie-cadence.o +pcie-cadence-host-mod-y := pcie-cadence-host-common.o pcie-cadence-host.o pcie-cadence-host-hpa.o +pcie-cadence-ep-mod-y := pcie-cadence-ep.o + +obj-$(CONFIG_PCIE_CADENCE) = pcie-cadence-mod.o +obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host-mod.o +obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep-mod.o obj-$(CONFIG_PCIE_CADENCE_PLAT) += pcie-cadence-plat.o obj-$(CONFIG_PCI_J721E) += pci-j721e.o obj-$(CONFIG_PCIE_SG2042_HOST) += pcie-sg2042.o +obj-$(CONFIG_PCI_SKY1_HOST) += pci-sky1.o diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c index 5bc5ab20aa6d..ecd1b0312400 100644 --- a/drivers/pci/controller/cadence/pci-j721e.c +++ b/drivers/pci/controller/cadence/pci-j721e.c @@ -477,9 +477,7 @@ static int j721e_pcie_probe(struct platform_device *pdev) struct j721e_pcie *pcie; struct cdns_pcie_rc *rc = NULL; struct cdns_pcie_ep *ep = NULL; - struct gpio_desc *gpiod; void __iomem *base; - struct clk *clk; u32 num_lanes; u32 mode; int ret; @@ -590,12 +588,12 @@ static int j721e_pcie_probe(struct platform_device *pdev) switch (mode) { case PCI_MODE_RC: - gpiod = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); - if (IS_ERR(gpiod)) { - ret = dev_err_probe(dev, PTR_ERR(gpiod), "Failed to get reset GPIO\n"); + pcie->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); + if (IS_ERR(pcie->reset_gpio)) { + ret = dev_err_probe(dev, PTR_ERR(pcie->reset_gpio), + "Failed to get reset GPIO\n"); goto err_get_sync; } - pcie->reset_gpio = gpiod; ret = cdns_pcie_init_phy(dev, cdns_pcie); if (ret) { @@ -603,19 +601,13 @@ static int j721e_pcie_probe(struct platform_device *pdev) goto err_get_sync; } - clk = devm_clk_get_optional(dev, "pcie_refclk"); - if (IS_ERR(clk)) { - ret = dev_err_probe(dev, PTR_ERR(clk), "failed to get pcie_refclk\n"); + pcie->refclk = devm_clk_get_optional_enabled(dev, "pcie_refclk"); + if (IS_ERR(pcie->refclk)) { + ret = dev_err_probe(dev, PTR_ERR(pcie->refclk), + "failed to enable pcie_refclk\n"); goto err_pcie_setup; } - ret = clk_prepare_enable(clk); - if (ret) { - 
dev_err_probe(dev, ret, "failed to enable pcie_refclk\n"); - goto err_pcie_setup; - } - pcie->refclk = clk; - /* * Section 2.2 of the PCI Express Card Electromechanical * Specification (Revision 5.1) mandates that the deassertion @@ -623,16 +615,14 @@ static int j721e_pcie_probe(struct platform_device *pdev) * This shall ensure that the power and the reference clock * are stable. */ - if (gpiod) { + if (pcie->reset_gpio) { msleep(PCIE_T_PVPERL_MS); - gpiod_set_value_cansleep(gpiod, 1); + gpiod_set_value_cansleep(pcie->reset_gpio, 1); } ret = cdns_pcie_host_setup(rc); - if (ret < 0) { - clk_disable_unprepare(pcie->refclk); + if (ret < 0) goto err_pcie_setup; - } break; case PCI_MODE_EP: @@ -679,7 +669,6 @@ static void j721e_pcie_remove(struct platform_device *pdev) gpiod_set_value_cansleep(pcie->reset_gpio, 0); - clk_disable_unprepare(pcie->refclk); cdns_pcie_disable_phy(cdns_pcie); j721e_pcie_disable_link_irq(pcie); pm_runtime_put(dev); diff --git a/drivers/pci/controller/cadence/pci-sky1.c b/drivers/pci/controller/cadence/pci-sky1.c new file mode 100644 index 000000000000..d8c216dc120d --- /dev/null +++ b/drivers/pci/controller/cadence/pci-sky1.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCIe controller driver for CIX's sky1 SoCs + * + * Copyright 2025 Cix Technology Group Co., Ltd. + * Author: Hans Zhang <hans.zhang@cixtech.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/pci.h> +#include <linux/pci-ecam.h> +#include <linux/pci_ids.h> + +#include "pcie-cadence.h" +#include "pcie-cadence-host-common.h" + +#define PCI_VENDOR_ID_CIX 0x1f6c +#define PCI_DEVICE_ID_CIX_SKY1 0x0001 + +#define STRAP_REG(n) ((n) * 0x04) +#define STATUS_REG(n) ((n) * 0x04) +#define LINK_TRAINING_ENABLE BIT(0) +#define LINK_COMPLETE BIT(0) + +#define SKY1_IP_REG_BANK 0x1000 +#define SKY1_IP_CFG_CTRL_REG_BANK 0x4c00 +#define SKY1_IP_AXI_MASTER_COMMON 0xf000 +#define SKY1_AXI_SLAVE 0x9000 +#define SKY1_AXI_MASTER 0xb000 +#define SKY1_AXI_HLS_REGISTERS 0xc000 +#define SKY1_AXI_RAS_REGISTERS 0xe000 +#define SKY1_DTI_REGISTERS 0xd000 + +#define IP_REG_I_DBG_STS_0 0x420 + +struct sky1_pcie { + struct cdns_pcie *cdns_pcie; + struct cdns_pcie_rc *cdns_pcie_rc; + + struct resource *cfg_res; + struct resource *msg_res; + struct pci_config_window *cfg; + void __iomem *strap_base; + void __iomem *status_base; + void __iomem *reg_base; + void __iomem *cfg_base; + void __iomem *msg_base; +}; + +static int sky1_pcie_resource_get(struct platform_device *pdev, + struct sky1_pcie *pcie) +{ + struct device *dev = &pdev->dev; + struct resource *res; + void __iomem *base; + + base = devm_platform_ioremap_resource_byname(pdev, "reg"); + if (IS_ERR(base)) + return dev_err_probe(dev, PTR_ERR(base), + "unable to find \"reg\" registers\n"); + pcie->reg_base = base; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg"); + if (!res) + return dev_err_probe(dev, -ENODEV, "unable to get \"cfg\" resource\n"); + pcie->cfg_res = res; + + base = devm_platform_ioremap_resource_byname(pdev, "rcsu_strap"); + if (IS_ERR(base)) + return dev_err_probe(dev, PTR_ERR(base), + "unable to find \"rcsu_strap\" registers\n"); + pcie->strap_base = base; + + base = devm_platform_ioremap_resource_byname(pdev, "rcsu_status"); + if (IS_ERR(base)) + return dev_err_probe(dev, PTR_ERR(base), + "unable to find \"rcsu_status\" registers\n"); + pcie->status_base = base; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "msg"); + if (!res) + 
return dev_err_probe(dev, -ENODEV, "unable to get \"msg\" resource\n"); + pcie->msg_res = res; + pcie->msg_base = devm_ioremap_resource(dev, res); + if (IS_ERR(pcie->msg_base)) { + return dev_err_probe(dev, PTR_ERR(pcie->msg_base), + "unable to ioremap msg resource\n"); + } + + return 0; +} + +static int sky1_pcie_start_link(struct cdns_pcie *cdns_pcie) +{ + struct sky1_pcie *pcie = dev_get_drvdata(cdns_pcie->dev); + u32 val; + + val = readl(pcie->strap_base + STRAP_REG(1)); + val |= LINK_TRAINING_ENABLE; + writel(val, pcie->strap_base + STRAP_REG(1)); + + return 0; +} + +static void sky1_pcie_stop_link(struct cdns_pcie *cdns_pcie) +{ + struct sky1_pcie *pcie = dev_get_drvdata(cdns_pcie->dev); + u32 val; + + val = readl(pcie->strap_base + STRAP_REG(1)); + val &= ~LINK_TRAINING_ENABLE; + writel(val, pcie->strap_base + STRAP_REG(1)); +} + +static bool sky1_pcie_link_up(struct cdns_pcie *cdns_pcie) +{ + u32 val; + + val = cdns_pcie_hpa_readl(cdns_pcie, REG_BANK_IP_REG, + IP_REG_I_DBG_STS_0); + return val & LINK_COMPLETE; +} + +static const struct cdns_pcie_ops sky1_pcie_ops = { + .start_link = sky1_pcie_start_link, + .stop_link = sky1_pcie_stop_link, + .link_up = sky1_pcie_link_up, +}; + +static int sky1_pcie_probe(struct platform_device *pdev) +{ + struct cdns_plat_pcie_of_data *reg_off; + struct device *dev = &pdev->dev; + struct pci_host_bridge *bridge; + struct cdns_pcie *cdns_pcie; + struct resource_entry *bus; + struct cdns_pcie_rc *rc; + struct sky1_pcie *pcie; + int ret; + + pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); + if (!pcie) + return -ENOMEM; + + bridge = devm_pci_alloc_host_bridge(dev, sizeof(*rc)); + if (!bridge) + return -ENOMEM; + + ret = sky1_pcie_resource_get(pdev, pcie); + if (ret < 0) + return ret; + + bus = resource_list_first_type(&bridge->windows, IORESOURCE_BUS); + if (!bus) + return -ENODEV; + + pcie->cfg = pci_ecam_create(dev, pcie->cfg_res, bus->res, + &pci_generic_ecam_ops); + if (IS_ERR(pcie->cfg)) + return PTR_ERR(pcie->cfg); + + bridge->ops = (struct pci_ops *)&pci_generic_ecam_ops.pci_ops; + rc = pci_host_bridge_priv(bridge); + rc->ecam_supported = 1; + rc->cfg_base = pcie->cfg->win; + rc->cfg_res = &pcie->cfg->res; + + cdns_pcie = &rc->pcie; + cdns_pcie->dev = dev; + cdns_pcie->ops = &sky1_pcie_ops; + cdns_pcie->reg_base = pcie->reg_base; + cdns_pcie->msg_res = pcie->msg_res; + cdns_pcie->is_rc = 1; + + reg_off = devm_kzalloc(dev, sizeof(*reg_off), GFP_KERNEL); + if (!reg_off) + return -ENOMEM; + + reg_off->ip_reg_bank_offset = SKY1_IP_REG_BANK; + reg_off->ip_cfg_ctrl_reg_offset = SKY1_IP_CFG_CTRL_REG_BANK; + reg_off->axi_mstr_common_offset = SKY1_IP_AXI_MASTER_COMMON; + reg_off->axi_slave_offset = SKY1_AXI_SLAVE; + reg_off->axi_master_offset = SKY1_AXI_MASTER; + reg_off->axi_hls_offset = SKY1_AXI_HLS_REGISTERS; + reg_off->axi_ras_offset = SKY1_AXI_RAS_REGISTERS; + reg_off->axi_dti_offset = SKY1_DTI_REGISTERS; + cdns_pcie->cdns_pcie_reg_offsets = reg_off; + + pcie->cdns_pcie = cdns_pcie; + pcie->cdns_pcie_rc = rc; + pcie->cfg_base = rc->cfg_base; + bridge->sysdata = pcie->cfg; + + rc->vendor_id = PCI_VENDOR_ID_CIX; + rc->device_id = PCI_DEVICE_ID_CIX_SKY1; + rc->no_inbound_map = 1; + + dev_set_drvdata(dev, pcie); + + ret = cdns_pcie_hpa_host_setup(rc); + if (ret < 0) { + pci_ecam_free(pcie->cfg); + return ret; + } + + return 0; +} + +static const struct of_device_id of_sky1_pcie_match[] = { + { .compatible = "cix,sky1-pcie-host", }, + {}, +}; +MODULE_DEVICE_TABLE(of, of_sky1_pcie_match); + +static void sky1_pcie_remove(struct platform_device *pdev) 
+{ + struct sky1_pcie *pcie = platform_get_drvdata(pdev); + + pci_ecam_free(pcie->cfg); +} + +static struct platform_driver sky1_pcie_driver = { + .probe = sky1_pcie_probe, + .remove = sky1_pcie_remove, + .driver = { + .name = "sky1-pcie", + .of_match_table = of_sky1_pcie_match, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, +}; +module_platform_driver(sky1_pcie_driver); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("PCIe controller driver for CIX's sky1 SoCs"); +MODULE_AUTHOR("Hans Zhang <hans.zhang@cixtech.com>"); diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-common.c b/drivers/pci/controller/cadence/pcie-cadence-host-common.c new file mode 100644 index 000000000000..15415d7f35ee --- /dev/null +++ b/drivers/pci/controller/cadence/pcie-cadence-host-common.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cadence PCIe host controller library. + * + * Copyright (c) 2017 Cadence + * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com> + */ +#include <linux/delay.h> +#include <linux/kernel.h> +#include <linux/list_sort.h> +#include <linux/of_address.h> +#include <linux/of_pci.h> +#include <linux/platform_device.h> + +#include "pcie-cadence.h" +#include "pcie-cadence-host-common.h" + +#define LINK_RETRAIN_TIMEOUT HZ + +u64 bar_max_size[] = { + [RP_BAR0] = _ULL(128 * SZ_2G), + [RP_BAR1] = SZ_2G, + [RP_NO_BAR] = _BITULL(63), +}; +EXPORT_SYMBOL_GPL(bar_max_size); + +int cdns_pcie_host_training_complete(struct cdns_pcie *pcie) +{ + u32 pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET; + unsigned long end_jiffies; + u16 lnk_stat; + + /* Wait for link training to complete. Exit after timeout. */ + end_jiffies = jiffies + LINK_RETRAIN_TIMEOUT; + do { + lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA); + if (!(lnk_stat & PCI_EXP_LNKSTA_LT)) + break; + usleep_range(0, 1000); + } while (time_before(jiffies, end_jiffies)); + + if (!(lnk_stat & PCI_EXP_LNKSTA_LT)) + return 0; + + return -ETIMEDOUT; +} +EXPORT_SYMBOL_GPL(cdns_pcie_host_training_complete); + +int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie, + cdns_pcie_linkup_func pcie_link_up) +{ + struct device *dev = pcie->dev; + int retries; + + /* Check if the link is up or not */ + for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) { + if (pcie_link_up(pcie)) { + dev_info(dev, "Link up\n"); + return 0; + } + usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX); + } + + return -ETIMEDOUT; +} +EXPORT_SYMBOL_GPL(cdns_pcie_host_wait_for_link); + +int cdns_pcie_retrain(struct cdns_pcie *pcie, + cdns_pcie_linkup_func pcie_link_up) +{ + u32 lnk_cap_sls, pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET; + u16 lnk_stat, lnk_ctl; + int ret = 0; + + /* + * Set retrain bit if current speed is 2.5 GB/s, + * but the PCIe root port support is > 2.5 GB/s. 
+ */ + + lnk_cap_sls = cdns_pcie_readl(pcie, (CDNS_PCIE_RP_BASE + pcie_cap_off + + PCI_EXP_LNKCAP)); + if ((lnk_cap_sls & PCI_EXP_LNKCAP_SLS) <= PCI_EXP_LNKCAP_SLS_2_5GB) + return ret; + + lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA); + if ((lnk_stat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) { + lnk_ctl = cdns_pcie_rp_readw(pcie, + pcie_cap_off + PCI_EXP_LNKCTL); + lnk_ctl |= PCI_EXP_LNKCTL_RL; + cdns_pcie_rp_writew(pcie, pcie_cap_off + PCI_EXP_LNKCTL, + lnk_ctl); + + ret = cdns_pcie_host_training_complete(pcie); + if (ret) + return ret; + + ret = cdns_pcie_host_wait_for_link(pcie, pcie_link_up); + } + return ret; +} +EXPORT_SYMBOL_GPL(cdns_pcie_retrain); + +int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc, + cdns_pcie_linkup_func pcie_link_up) +{ + struct cdns_pcie *pcie = &rc->pcie; + int ret; + + ret = cdns_pcie_host_wait_for_link(pcie, pcie_link_up); + + /* + * Retrain link for Gen2 training defect + * if quirk flag is set. + */ + if (!ret && rc->quirk_retrain_flag) + ret = cdns_pcie_retrain(pcie, pcie_link_up); + + return ret; +} +EXPORT_SYMBOL_GPL(cdns_pcie_host_start_link); + +enum cdns_pcie_rp_bar +cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size) +{ + enum cdns_pcie_rp_bar bar, sel_bar; + + sel_bar = RP_BAR_UNDEFINED; + for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) { + if (!rc->avail_ib_bar[bar]) + continue; + + if (size <= bar_max_size[bar]) { + if (sel_bar == RP_BAR_UNDEFINED) { + sel_bar = bar; + continue; + } + + if (bar_max_size[bar] < bar_max_size[sel_bar]) + sel_bar = bar; + } + } + + return sel_bar; +} +EXPORT_SYMBOL_GPL(cdns_pcie_host_find_min_bar); + +enum cdns_pcie_rp_bar +cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size) +{ + enum cdns_pcie_rp_bar bar, sel_bar; + + sel_bar = RP_BAR_UNDEFINED; + for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) { + if (!rc->avail_ib_bar[bar]) + continue; + + if (size >= bar_max_size[bar]) { + if (sel_bar == RP_BAR_UNDEFINED) { + sel_bar = bar; + continue; + } + + if (bar_max_size[bar] > bar_max_size[sel_bar]) + sel_bar = bar; + } + } + + return sel_bar; +} +EXPORT_SYMBOL_GPL(cdns_pcie_host_find_max_bar); + +int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct resource_entry *entry1, *entry2; + + entry1 = container_of(a, struct resource_entry, node); + entry2 = container_of(b, struct resource_entry, node); + + return resource_size(entry2->res) - resource_size(entry1->res); +} +EXPORT_SYMBOL_GPL(cdns_pcie_host_dma_ranges_cmp); + +int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc, + struct resource_entry *entry, + cdns_pcie_host_bar_ib_cfg pci_host_ib_config) +{ + struct cdns_pcie *pcie = &rc->pcie; + struct device *dev = pcie->dev; + u64 cpu_addr, size, winsize; + enum cdns_pcie_rp_bar bar; + unsigned long flags; + int ret; + + cpu_addr = entry->res->start; + flags = entry->res->flags; + size = resource_size(entry->res); + + while (size > 0) { + /* + * Try to find a minimum BAR whose size is greater than + * or equal to the remaining resource_entry size. This will + * fail if the size of each of the available BARs is less than + * the remaining resource_entry size. + * + * If a minimum BAR is found, IB ATU will be configured and + * exited. 
+ */ + bar = cdns_pcie_host_find_min_bar(rc, size); + if (bar != RP_BAR_UNDEFINED) { + ret = pci_host_ib_config(rc, bar, cpu_addr, size, flags); + if (ret) + dev_err(dev, "IB BAR: %d config failed\n", bar); + return ret; + } + + /* + * If the control reaches here, it would mean the remaining + * resource_entry size cannot be fitted in a single BAR. So we + * find a maximum BAR whose size is less than or equal to the + * remaining resource_entry size and split the resource entry + * so that part of resource entry is fitted inside the maximum + * BAR. The remaining size would be fitted during the next + * iteration of the loop. + * + * If a maximum BAR is not found, there is no way we can fit + * this resource_entry, so we error out. + */ + bar = cdns_pcie_host_find_max_bar(rc, size); + if (bar == RP_BAR_UNDEFINED) { + dev_err(dev, "No free BAR to map cpu_addr %llx\n", + cpu_addr); + return -EINVAL; + } + + winsize = bar_max_size[bar]; + ret = pci_host_ib_config(rc, bar, cpu_addr, winsize, flags); + if (ret) { + dev_err(dev, "IB BAR: %d config failed\n", bar); + return ret; + } + + size -= winsize; + cpu_addr += winsize; + } + + return 0; +} + +int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc, + cdns_pcie_host_bar_ib_cfg pci_host_ib_config) +{ + struct cdns_pcie *pcie = &rc->pcie; + struct device *dev = pcie->dev; + struct device_node *np = dev->of_node; + struct pci_host_bridge *bridge; + struct resource_entry *entry; + u32 no_bar_nbits = 32; + int err; + + bridge = pci_host_bridge_from_priv(rc); + if (!bridge) + return -ENOMEM; + + if (list_empty(&bridge->dma_ranges)) { + of_property_read_u32(np, "cdns,no-bar-match-nbits", + &no_bar_nbits); + err = pci_host_ib_config(rc, RP_NO_BAR, 0x0, (u64)1 << no_bar_nbits, 0); + if (err) + dev_err(dev, "IB BAR: %d config failed\n", RP_NO_BAR); + return err; + } + + list_sort(NULL, &bridge->dma_ranges, cdns_pcie_host_dma_ranges_cmp); + + resource_list_for_each_entry(entry, &bridge->dma_ranges) { + err = cdns_pcie_host_bar_config(rc, entry, pci_host_ib_config); + if (err) { + dev_err(dev, "Fail to configure IB using dma-ranges\n"); + return err; + } + } + + return 0; +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Cadence PCIe host controller driver"); diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-common.h b/drivers/pci/controller/cadence/pcie-cadence-host-common.h new file mode 100644 index 000000000000..fe7d4202a8b6 --- /dev/null +++ b/drivers/pci/controller/cadence/pcie-cadence-host-common.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Cadence PCIe Host controller driver. 
+ * + * Copyright (c) 2017 Cadence + * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com> + */ +#ifndef _PCIE_CADENCE_HOST_COMMON_H +#define _PCIE_CADENCE_HOST_COMMON_H + +#include <linux/kernel.h> +#include <linux/pci.h> + +extern u64 bar_max_size[]; + +typedef int (*cdns_pcie_host_bar_ib_cfg)(struct cdns_pcie_rc *, + enum cdns_pcie_rp_bar, + u64, + u64, + unsigned long); +typedef bool (*cdns_pcie_linkup_func)(struct cdns_pcie *); + +int cdns_pcie_host_training_complete(struct cdns_pcie *pcie); +int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie, + cdns_pcie_linkup_func pcie_link_up); +int cdns_pcie_retrain(struct cdns_pcie *pcie, cdns_pcie_linkup_func pcie_linkup_func); +int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc, + cdns_pcie_linkup_func pcie_link_up); +enum cdns_pcie_rp_bar +cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size); +enum cdns_pcie_rp_bar +cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size); +int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a, + const struct list_head *b); +int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc, + enum cdns_pcie_rp_bar bar, + u64 cpu_addr, + u64 size, + unsigned long flags); +int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc, + struct resource_entry *entry, + cdns_pcie_host_bar_ib_cfg pci_host_ib_config); +int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc, + cdns_pcie_host_bar_ib_cfg pci_host_ib_config); + +#endif /* _PCIE_CADENCE_HOST_COMMON_H */ diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c b/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c new file mode 100644 index 000000000000..0f540bed58e8 --- /dev/null +++ b/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cadence PCIe host controller driver. + * + * Copyright (c) 2024, Cadence Design Systems + * Author: Manikandan K Pillai <mpillai@cadence.com> + */ +#include <linux/delay.h> +#include <linux/kernel.h> +#include <linux/list_sort.h> +#include <linux/of_address.h> +#include <linux/of_pci.h> +#include <linux/of_irq.h> +#include <linux/platform_device.h> + +#include "pcie-cadence.h" +#include "pcie-cadence-host-common.h" + +static u8 bar_aperture_mask[] = { + [RP_BAR0] = 0x3F, + [RP_BAR1] = 0x3F, +}; + +void __iomem *cdns_pci_hpa_map_bus(struct pci_bus *bus, unsigned int devfn, + int where) +{ + struct pci_host_bridge *bridge = pci_find_host_bridge(bus); + struct cdns_pcie_rc *rc = pci_host_bridge_priv(bridge); + struct cdns_pcie *pcie = &rc->pcie; + unsigned int busn = bus->number; + u32 addr0, desc0, desc1, ctrl0; + u32 regval; + + if (pci_is_root_bus(bus)) { + /* + * Only the root port (devfn == 0) is connected to this bus. + * All other PCI devices are behind some bridge hence on another + * bus. 
+ */ + if (devfn) + return NULL; + + return pcie->reg_base + (where & 0xfff); + } + + /* Clear AXI link-down status */ + regval = cdns_pcie_hpa_readl(pcie, REG_BANK_AXI_SLAVE, CDNS_PCIE_HPA_AT_LINKDOWN); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, CDNS_PCIE_HPA_AT_LINKDOWN, + (regval & ~GENMASK(0, 0))); + + /* Update Output registers for AXI region 0 */ + addr0 = CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(12) | + CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) | + CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS(busn); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(0), addr0); + + desc1 = cdns_pcie_hpa_readl(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0)); + desc1 &= ~CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK; + desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0); + ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS | + CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN; + + if (busn == bridge->busnr + 1) + desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0; + else + desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1; + + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC0(0), desc0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0), desc1); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(0), ctrl0); + + return rc->cfg_base + (where & 0xfff); +} + +static struct pci_ops cdns_pcie_hpa_host_ops = { + .map_bus = cdns_pci_hpa_map_bus, + .read = pci_generic_config_read, + .write = pci_generic_config_write, +}; + +static void cdns_pcie_hpa_host_enable_ptm_response(struct cdns_pcie *pcie) +{ + u32 val; + + val = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_LM_PTM_CTRL); + cdns_pcie_hpa_writel(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_LM_PTM_CTRL, + val | CDNS_PCIE_HPA_LM_PTM_CTRL_PTMRSEN); +} + +static int cdns_pcie_hpa_host_bar_ib_config(struct cdns_pcie_rc *rc, + enum cdns_pcie_rp_bar bar, + u64 cpu_addr, u64 size, + unsigned long flags) +{ + struct cdns_pcie *pcie = &rc->pcie; + u32 addr0, addr1, aperture, value; + + if (!rc->avail_ib_bar[bar]) + return -ENODEV; + + rc->avail_ib_bar[bar] = false; + + aperture = ilog2(size); + if (bar == RP_NO_BAR) { + addr0 = CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS(aperture) | + (lower_32_bits(cpu_addr) & GENMASK(31, 8)); + addr1 = upper_32_bits(cpu_addr); + } else { + addr0 = lower_32_bits(cpu_addr); + addr1 = upper_32_bits(cpu_addr); + } + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_MASTER, + CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0(bar), addr0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_MASTER, + CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR1(bar), addr1); + + if (bar == RP_NO_BAR) + bar = (enum cdns_pcie_rp_bar)BAR_0; + + value = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_CFG_CTRL_REG, CDNS_PCIE_HPA_LM_RC_BAR_CFG); + value &= ~(HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) | + HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) | + HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) | + HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) | + HPA_LM_RC_BAR_CFG_APERTURE(bar, bar_aperture_mask[bar] + 7)); + if (size + cpu_addr >= SZ_4G) { + value |= HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar); + if ((flags & IORESOURCE_PREFETCH)) + value |= HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar); + } else { + value |= HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar); + if ((flags & IORESOURCE_PREFETCH)) + value |= HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar); + } + + value |= HPA_LM_RC_BAR_CFG_APERTURE(bar, aperture); + cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG, 
CDNS_PCIE_HPA_LM_RC_BAR_CFG, value); + + return 0; +} + +static int cdns_pcie_hpa_host_init_root_port(struct cdns_pcie_rc *rc) +{ + struct cdns_pcie *pcie = &rc->pcie; + u32 value, ctrl; + + /* + * Set the root port BAR configuration register: + * - disable both BAR0 and BAR1 + * - enable Prefetchable Memory Base and Limit registers in type 1 + * config space (64 bits) + * - enable IO Base and Limit registers in type 1 config + * space (32 bits) + */ + + ctrl = CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED; + value = CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL(ctrl) | + CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL(ctrl) | + CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE | + CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS | + CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_ENABLE | + CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_32BITS; + cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG, + CDNS_PCIE_HPA_LM_RC_BAR_CFG, value); + + if (rc->vendor_id != 0xffff) + cdns_pcie_hpa_rp_writew(pcie, PCI_VENDOR_ID, rc->vendor_id); + + if (rc->device_id != 0xffff) + cdns_pcie_hpa_rp_writew(pcie, PCI_DEVICE_ID, rc->device_id); + + cdns_pcie_hpa_rp_writeb(pcie, PCI_CLASS_REVISION, 0); + cdns_pcie_hpa_rp_writeb(pcie, PCI_CLASS_PROG, 0); + cdns_pcie_hpa_rp_writew(pcie, PCI_CLASS_DEVICE, PCI_CLASS_BRIDGE_PCI); + + /* Enable bus mastering */ + value = cdns_pcie_hpa_readl(pcie, REG_BANK_RP, PCI_COMMAND); + value |= (PCI_COMMAND_MEMORY | PCI_COMMAND_IO | PCI_COMMAND_MASTER); + cdns_pcie_hpa_writel(pcie, REG_BANK_RP, PCI_COMMAND, value); + return 0; +} + +static void cdns_pcie_hpa_create_region_for_cfg(struct cdns_pcie_rc *rc) +{ + struct cdns_pcie *pcie = &rc->pcie; + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(rc); + struct resource *cfg_res = rc->cfg_res; + struct resource_entry *entry; + u64 cpu_addr = cfg_res->start; + u32 addr0, addr1, desc1; + int busnr = 0; + + entry = resource_list_first_type(&bridge->windows, IORESOURCE_BUS); + if (entry) + busnr = entry->res->start; + + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_TAG_MANAGEMENT, 0x01000000); + /* + * Reserve region 0 for PCI configure space accesses: + * OB_REGION_PCI_ADDR0 and OB_REGION_DESC0 are updated dynamically by + * cdns_pci_map_bus(), other region registers are set here once for all + */ + desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(0), 0x0); + /* Type-1 CFG */ + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC0(0), 0x05000000); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0), desc1); + + addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(12) | + (lower_32_bits(cpu_addr) & GENMASK(31, 8)); + addr1 = upper_32_bits(cpu_addr); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(0), addr0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(0), addr1); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(0), 0x06000000); +} + +static int cdns_pcie_hpa_host_init_address_translation(struct cdns_pcie_rc *rc) +{ + struct cdns_pcie *pcie = &rc->pcie; + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(rc); + struct resource_entry *entry; + int r = 0, busnr = 0; + + if (!rc->ecam_supported) + cdns_pcie_hpa_create_region_for_cfg(rc); + + entry = resource_list_first_type(&bridge->windows, IORESOURCE_BUS); + if (entry) + busnr = entry->res->start; + + r++; + if (pcie->msg_res) { + 
cdns_pcie_hpa_set_outbound_region_for_normal_msg(pcie, busnr, 0, r, + pcie->msg_res->start); + + r++; + } + resource_list_for_each_entry(entry, &bridge->windows) { + struct resource *res = entry->res; + u64 pci_addr = res->start - entry->offset; + + if (resource_type(res) == IORESOURCE_IO) + cdns_pcie_hpa_set_outbound_region(pcie, busnr, 0, r, + true, + pci_pio_to_address(res->start), + pci_addr, + resource_size(res)); + else + cdns_pcie_hpa_set_outbound_region(pcie, busnr, 0, r, + false, + res->start, + pci_addr, + resource_size(res)); + + r++; + } + + if (rc->no_inbound_map) + return 0; + else + return cdns_pcie_host_map_dma_ranges(rc, cdns_pcie_hpa_host_bar_ib_config); +} + +static int cdns_pcie_hpa_host_init(struct cdns_pcie_rc *rc) +{ + int err; + + err = cdns_pcie_hpa_host_init_root_port(rc); + if (err) + return err; + + return cdns_pcie_hpa_host_init_address_translation(rc); +} + +int cdns_pcie_hpa_host_link_setup(struct cdns_pcie_rc *rc) +{ + struct cdns_pcie *pcie = &rc->pcie; + struct device *dev = rc->pcie.dev; + int ret; + + if (rc->quirk_detect_quiet_flag) + cdns_pcie_hpa_detect_quiet_min_delay_set(&rc->pcie); + + cdns_pcie_hpa_host_enable_ptm_response(pcie); + + ret = cdns_pcie_start_link(pcie); + if (ret) { + dev_err(dev, "Failed to start link\n"); + return ret; + } + + ret = cdns_pcie_host_wait_for_link(pcie, cdns_pcie_hpa_link_up); + if (ret) + dev_dbg(dev, "PCIe link never came up\n"); + + return ret; +} +EXPORT_SYMBOL_GPL(cdns_pcie_hpa_host_link_setup); + +int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc) +{ + struct device *dev = rc->pcie.dev; + struct platform_device *pdev = to_platform_device(dev); + struct pci_host_bridge *bridge; + enum cdns_pcie_rp_bar bar; + struct cdns_pcie *pcie; + struct resource *res; + int ret; + + bridge = pci_host_bridge_from_priv(rc); + if (!bridge) + return -ENOMEM; + + pcie = &rc->pcie; + pcie->is_rc = true; + + if (!pcie->reg_base) { + pcie->reg_base = devm_platform_ioremap_resource_byname(pdev, "reg"); + if (IS_ERR(pcie->reg_base)) { + dev_err(dev, "missing \"reg\"\n"); + return PTR_ERR(pcie->reg_base); + } + } + + /* ECAM config space is remapped at glue layer */ + if (!rc->cfg_base) { + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg"); + rc->cfg_base = devm_pci_remap_cfg_resource(dev, res); + if (IS_ERR(rc->cfg_base)) + return PTR_ERR(rc->cfg_base); + rc->cfg_res = res; + } + + /* Put EROM Bar aperture to 0 */ + cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG, CDNS_PCIE_EROM, 0x0); + + ret = cdns_pcie_hpa_host_link_setup(rc); + if (ret) + return ret; + + for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) + rc->avail_ib_bar[bar] = true; + + ret = cdns_pcie_hpa_host_init(rc); + if (ret) + return ret; + + if (!bridge->ops) + bridge->ops = &cdns_pcie_hpa_host_ops; + + return pci_host_probe(bridge); +} +EXPORT_SYMBOL_GPL(cdns_pcie_hpa_host_setup); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Cadence PCIe host controller driver"); diff --git a/drivers/pci/controller/cadence/pcie-cadence-host.c b/drivers/pci/controller/cadence/pcie-cadence-host.c index fffd63d6665e..db3154c1eccb 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-host.c +++ b/drivers/pci/controller/cadence/pcie-cadence-host.c @@ -12,14 +12,7 @@ #include <linux/platform_device.h> #include "pcie-cadence.h" - -#define LINK_RETRAIN_TIMEOUT HZ - -static u64 bar_max_size[] = { - [RP_BAR0] = _ULL(128 * SZ_2G), - [RP_BAR1] = SZ_2G, - [RP_NO_BAR] = _BITULL(63), -}; +#include "pcie-cadence-host-common.h" static u8 bar_aperture_mask[] = { [RP_BAR0] = 0x1F, @@ 
-81,77 +74,6 @@ static struct pci_ops cdns_pcie_host_ops = { .write = pci_generic_config_write, }; -static int cdns_pcie_host_training_complete(struct cdns_pcie *pcie) -{ - u32 pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET; - unsigned long end_jiffies; - u16 lnk_stat; - - /* Wait for link training to complete. Exit after timeout. */ - end_jiffies = jiffies + LINK_RETRAIN_TIMEOUT; - do { - lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA); - if (!(lnk_stat & PCI_EXP_LNKSTA_LT)) - break; - usleep_range(0, 1000); - } while (time_before(jiffies, end_jiffies)); - - if (!(lnk_stat & PCI_EXP_LNKSTA_LT)) - return 0; - - return -ETIMEDOUT; -} - -static int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie) -{ - struct device *dev = pcie->dev; - int retries; - - /* Check if the link is up or not */ - for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) { - if (cdns_pcie_link_up(pcie)) { - dev_info(dev, "Link up\n"); - return 0; - } - usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX); - } - - return -ETIMEDOUT; -} - -static int cdns_pcie_retrain(struct cdns_pcie *pcie) -{ - u32 lnk_cap_sls, pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET; - u16 lnk_stat, lnk_ctl; - int ret = 0; - - /* - * Set retrain bit if current speed is 2.5 GB/s, - * but the PCIe root port support is > 2.5 GB/s. - */ - - lnk_cap_sls = cdns_pcie_readl(pcie, (CDNS_PCIE_RP_BASE + pcie_cap_off + - PCI_EXP_LNKCAP)); - if ((lnk_cap_sls & PCI_EXP_LNKCAP_SLS) <= PCI_EXP_LNKCAP_SLS_2_5GB) - return ret; - - lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA); - if ((lnk_stat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) { - lnk_ctl = cdns_pcie_rp_readw(pcie, - pcie_cap_off + PCI_EXP_LNKCTL); - lnk_ctl |= PCI_EXP_LNKCTL_RL; - cdns_pcie_rp_writew(pcie, pcie_cap_off + PCI_EXP_LNKCTL, - lnk_ctl); - - ret = cdns_pcie_host_training_complete(pcie); - if (ret) - return ret; - - ret = cdns_pcie_host_wait_for_link(pcie); - } - return ret; -} - static void cdns_pcie_host_disable_ptm_response(struct cdns_pcie *pcie) { u32 val; @@ -168,23 +90,6 @@ static void cdns_pcie_host_enable_ptm_response(struct cdns_pcie *pcie) cdns_pcie_writel(pcie, CDNS_PCIE_LM_PTM_CTRL, val | CDNS_PCIE_LM_TPM_CTRL_PTMRSEN); } -static int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc) -{ - struct cdns_pcie *pcie = &rc->pcie; - int ret; - - ret = cdns_pcie_host_wait_for_link(pcie); - - /* - * Retrain link for Gen2 training defect - * if quirk flag is set. 
- */ - if (!ret && rc->quirk_retrain_flag) - ret = cdns_pcie_retrain(pcie); - - return ret; -} - static void cdns_pcie_host_deinit_root_port(struct cdns_pcie_rc *rc) { struct cdns_pcie *pcie = &rc->pcie; @@ -245,10 +150,11 @@ static int cdns_pcie_host_init_root_port(struct cdns_pcie_rc *rc) return 0; } -static int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc, - enum cdns_pcie_rp_bar bar, - u64 cpu_addr, u64 size, - unsigned long flags) +int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc, + enum cdns_pcie_rp_bar bar, + u64 cpu_addr, + u64 size, + unsigned long flags) { struct cdns_pcie *pcie = &rc->pcie; u32 addr0, addr1, aperture, value; @@ -290,137 +196,6 @@ static int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc, return 0; } -static enum cdns_pcie_rp_bar -cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size) -{ - enum cdns_pcie_rp_bar bar, sel_bar; - - sel_bar = RP_BAR_UNDEFINED; - for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) { - if (!rc->avail_ib_bar[bar]) - continue; - - if (size <= bar_max_size[bar]) { - if (sel_bar == RP_BAR_UNDEFINED) { - sel_bar = bar; - continue; - } - - if (bar_max_size[bar] < bar_max_size[sel_bar]) - sel_bar = bar; - } - } - - return sel_bar; -} - -static enum cdns_pcie_rp_bar -cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size) -{ - enum cdns_pcie_rp_bar bar, sel_bar; - - sel_bar = RP_BAR_UNDEFINED; - for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) { - if (!rc->avail_ib_bar[bar]) - continue; - - if (size >= bar_max_size[bar]) { - if (sel_bar == RP_BAR_UNDEFINED) { - sel_bar = bar; - continue; - } - - if (bar_max_size[bar] > bar_max_size[sel_bar]) - sel_bar = bar; - } - } - - return sel_bar; -} - -static int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc, - struct resource_entry *entry) -{ - u64 cpu_addr, pci_addr, size, winsize; - struct cdns_pcie *pcie = &rc->pcie; - struct device *dev = pcie->dev; - enum cdns_pcie_rp_bar bar; - unsigned long flags; - int ret; - - cpu_addr = entry->res->start; - pci_addr = entry->res->start - entry->offset; - flags = entry->res->flags; - size = resource_size(entry->res); - - if (entry->offset) { - dev_err(dev, "PCI addr: %llx must be equal to CPU addr: %llx\n", - pci_addr, cpu_addr); - return -EINVAL; - } - - while (size > 0) { - /* - * Try to find a minimum BAR whose size is greater than - * or equal to the remaining resource_entry size. This will - * fail if the size of each of the available BARs is less than - * the remaining resource_entry size. - * If a minimum BAR is found, IB ATU will be configured and - * exited. - */ - bar = cdns_pcie_host_find_min_bar(rc, size); - if (bar != RP_BAR_UNDEFINED) { - ret = cdns_pcie_host_bar_ib_config(rc, bar, cpu_addr, - size, flags); - if (ret) - dev_err(dev, "IB BAR: %d config failed\n", bar); - return ret; - } - - /* - * If the control reaches here, it would mean the remaining - * resource_entry size cannot be fitted in a single BAR. So we - * find a maximum BAR whose size is less than or equal to the - * remaining resource_entry size and split the resource entry - * so that part of resource entry is fitted inside the maximum - * BAR. The remaining size would be fitted during the next - * iteration of the loop. - * If a maximum BAR is not found, there is no way we can fit - * this resource_entry, so we error out. 
- */ - bar = cdns_pcie_host_find_max_bar(rc, size); - if (bar == RP_BAR_UNDEFINED) { - dev_err(dev, "No free BAR to map cpu_addr %llx\n", - cpu_addr); - return -EINVAL; - } - - winsize = bar_max_size[bar]; - ret = cdns_pcie_host_bar_ib_config(rc, bar, cpu_addr, winsize, - flags); - if (ret) { - dev_err(dev, "IB BAR: %d config failed\n", bar); - return ret; - } - - size -= winsize; - cpu_addr += winsize; - } - - return 0; -} - -static int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct resource_entry *entry1, *entry2; - - entry1 = container_of(a, struct resource_entry, node); - entry2 = container_of(b, struct resource_entry, node); - - return resource_size(entry2->res) - resource_size(entry1->res); -} - static void cdns_pcie_host_unmap_dma_ranges(struct cdns_pcie_rc *rc) { struct cdns_pcie *pcie = &rc->pcie; @@ -447,43 +222,6 @@ static void cdns_pcie_host_unmap_dma_ranges(struct cdns_pcie_rc *rc) } } -static int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc) -{ - struct cdns_pcie *pcie = &rc->pcie; - struct device *dev = pcie->dev; - struct device_node *np = dev->of_node; - struct pci_host_bridge *bridge; - struct resource_entry *entry; - u32 no_bar_nbits = 32; - int err; - - bridge = pci_host_bridge_from_priv(rc); - if (!bridge) - return -ENOMEM; - - if (list_empty(&bridge->dma_ranges)) { - of_property_read_u32(np, "cdns,no-bar-match-nbits", - &no_bar_nbits); - err = cdns_pcie_host_bar_ib_config(rc, RP_NO_BAR, 0x0, - (u64)1 << no_bar_nbits, 0); - if (err) - dev_err(dev, "IB BAR: %d config failed\n", RP_NO_BAR); - return err; - } - - list_sort(NULL, &bridge->dma_ranges, cdns_pcie_host_dma_ranges_cmp); - - resource_list_for_each_entry(entry, &bridge->dma_ranges) { - err = cdns_pcie_host_bar_config(rc, entry); - if (err) { - dev_err(dev, "Fail to configure IB using dma-ranges\n"); - return err; - } - } - - return 0; -} - static void cdns_pcie_host_deinit_address_translation(struct cdns_pcie_rc *rc) { struct cdns_pcie *pcie = &rc->pcie; @@ -561,7 +299,7 @@ static int cdns_pcie_host_init_address_translation(struct cdns_pcie_rc *rc) r++; } - return cdns_pcie_host_map_dma_ranges(rc); + return cdns_pcie_host_map_dma_ranges(rc, cdns_pcie_host_bar_ib_config); } static void cdns_pcie_host_deinit(struct cdns_pcie_rc *rc) @@ -607,7 +345,7 @@ int cdns_pcie_host_link_setup(struct cdns_pcie_rc *rc) return ret; } - ret = cdns_pcie_host_start_link(rc); + ret = cdns_pcie_host_start_link(rc, cdns_pcie_link_up); if (ret) dev_dbg(dev, "PCIe link never came up\n"); diff --git a/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h b/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h new file mode 100644 index 000000000000..026e131600de --- /dev/null +++ b/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Cadence PCIe controller driver. 
+ * + * Copyright (c) 2024, Cadence Design Systems + * Author: Manikandan K Pillai <mpillai@cadence.com> + */ +#ifndef _PCIE_CADENCE_HPA_REGS_H +#define _PCIE_CADENCE_HPA_REGS_H + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/pci-epf.h> +#include <linux/phy/phy.h> +#include <linux/bitfield.h> + +/* High Performance Architecture (HPA) PCIe controller registers */ +#define CDNS_PCIE_HPA_IP_REG_BANK 0x01000000 +#define CDNS_PCIE_HPA_IP_CFG_CTRL_REG_BANK 0x01003C00 +#define CDNS_PCIE_HPA_IP_AXI_MASTER_COMMON 0x02020000 + +/* Address Translation Registers */ +#define CDNS_PCIE_HPA_AXI_SLAVE 0x03000000 +#define CDNS_PCIE_HPA_AXI_MASTER 0x03002000 + +/* Root Port register base address */ +#define CDNS_PCIE_HPA_RP_BASE 0x0 + +#define CDNS_PCIE_HPA_LM_ID 0x1420 + +/* Endpoint Function BARs */ +#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG(bar, fn) \ + (((bar) < BAR_3) ? CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG0(fn) : \ + CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG1(fn)) +#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG0(pfn) (0x4000 * (pfn)) +#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG1(pfn) ((0x4000 * (pfn)) + 0x04) +#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG(bar, fn) \ + (((bar) < BAR_3) ? CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG0(fn) : \ + CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG1(fn)) +#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG0(vfn) ((0x4000 * (vfn)) + 0x08) +#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG1(vfn) ((0x4000 * (vfn)) + 0x0C) +#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(f) \ + (GENMASK(5, 0) << (0x4 + (f) * 10)) +#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \ + (((a) << (4 + ((b) * 10))) & (CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b))) +#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(f) \ + (GENMASK(3, 0) << ((f) * 10)) +#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \ + (((c) << ((b) * 10)) & (CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b))) + +/* Endpoint Function Configuration Register */ +#define CDNS_PCIE_HPA_LM_EP_FUNC_CFG 0x02C0 + +/* Root Complex BAR Configuration Register */ +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG 0x14 +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(9, 4) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE(a) \ + FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE_MASK, a) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(3, 0) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL(c) \ + FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL_MASK, c) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(19, 14) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE(a) \ + FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE_MASK, a) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(13, 10) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL(c) \ + FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL_MASK, c) + +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(20) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(21) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_ENABLE BIT(22) +#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_32BITS BIT(23) + +/* BAR control values applicable to both Endpoint Function and Root Complex */ +#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED 0x0 +#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_IO_32BITS 0x3 +#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_32BITS 0x1 +#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x9 +#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_64BITS 0x5 +#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0xD + +#define HPA_LM_RC_BAR_CFG_CTRL_DISABLED(bar) \ + 
(CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED << ((bar) * 10)) +#define HPA_LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \ + (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_IO_32BITS << ((bar) * 10)) +#define HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \ + (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_32BITS << ((bar) * 10)) +#define HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \ + (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << ((bar) * 10)) +#define HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \ + (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_64BITS << ((bar) * 10)) +#define HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \ + (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << ((bar) * 10)) +#define HPA_LM_RC_BAR_CFG_APERTURE(bar, aperture) \ + (((aperture) - 7) << (((bar) * 10) + 4)) + +#define CDNS_PCIE_HPA_LM_PTM_CTRL 0x0520 +#define CDNS_PCIE_HPA_LM_PTM_CTRL_PTMRSEN BIT(17) + +/* Root Port Registers PCI config space for root port function */ +#define CDNS_PCIE_HPA_RP_CAP_OFFSET 0xC0 + +/* Region r Outbound AXI to PCIe Address Translation Register 0 */ +#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r) (0x1010 + ((r) & 0x1F) * 0x0080) +#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0) +#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \ + (((nbits) - 1) & CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS_MASK) +#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(23, 16) +#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK, devfn) +#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(31, 24) +#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS(bus) \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS_MASK, bus) + +/* Region r Outbound AXI to PCIe Address Translation Register 1 */ +#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r) (0x1014 + ((r) & 0x1F) * 0x0080) + +/* Region r Outbound PCIe Descriptor Register */ +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r) (0x1008 + ((r) & 0x1F) * 0x0080) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(28, 24) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MEM \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x0) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_IO \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x2) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x4) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x5) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x10) + +/* Region r Outbound PCIe Descriptor Register */ +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r) (0x100C + ((r) & 0x1F) * 0x0080) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS_MASK GENMASK(31, 24) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(bus) \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS_MASK, bus) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK GENMASK(23, 16) +#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(devfn) \ + FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK, devfn) + +#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r) (0x1018 + ((r) & 0x1F) * 0x0080) +#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS BIT(26) +#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN BIT(25) + +/* Region r AXI Region Base Address Register 0 */ +#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r) (0x1000 + ((r) & 0x1F) * 0x0080) +#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0) 
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \ + (((nbits) - 1) & CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS_MASK) + +/* Region r AXI Region Base Address Register 1 */ +#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r) (0x1004 + ((r) & 0x1F) * 0x0080) + +/* Root Port BAR Inbound PCIe to AXI Address Translation Register */ +#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0(bar) (((bar) * 0x0008)) +#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0) +#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \ + (((nbits) - 1) & CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS_MASK) +#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR1(bar) (0x04 + ((bar) * 0x0008)) + +/* AXI link down register */ +#define CDNS_PCIE_HPA_AT_LINKDOWN 0x04 + +/* + * Physical Layer Configuration Register 0 + * This register contains the parameters required for functional setup + * of Physical Layer. + */ +#define CDNS_PCIE_HPA_PHY_LAYER_CFG0 0x0400 +#define CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK GENMASK(26, 24) +#define CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY(delay) \ + FIELD_PREP(CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK, delay) +#define CDNS_PCIE_HPA_LINK_TRNG_EN_MASK GENMASK(27, 27) + +#define CDNS_PCIE_HPA_PHY_DBG_STS_REG0 0x0420 + +#define CDNS_PCIE_HPA_RP_MAX_IB 0x3 +#define CDNS_PCIE_HPA_MAX_OB 15 + +/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */ +#define CDNS_PCIE_HPA_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) (((fn) * 0x0080) + ((bar) * 0x0008)) +#define CDNS_PCIE_HPA_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) (0x4 + ((fn) * 0x0080) + ((bar) * 0x0008)) + +/* Miscellaneous offsets definitions */ +#define CDNS_PCIE_HPA_TAG_MANAGEMENT 0x0 +#define CDNS_PCIE_HPA_SLAVE_RESP 0x100 + +#define I_ROOT_PORT_REQ_ID_REG 0x141c +#define LM_HAL_SBSA_CTRL 0x1170 + +#define I_PCIE_BUS_NUMBERS (CDNS_PCIE_HPA_RP_BASE + 0x18) +#define CDNS_PCIE_EROM 0x18 +#endif /* _PCIE_CADENCE_HPA_REGS_H */ diff --git a/drivers/pci/controller/cadence/pcie-cadence-hpa.c b/drivers/pci/controller/cadence/pcie-cadence-hpa.c new file mode 100644 index 000000000000..f60a16938265 --- /dev/null +++ b/drivers/pci/controller/cadence/pcie-cadence-hpa.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cadence PCIe controller driver. + * + * Copyright (c) 2024, Cadence Design Systems + * Author: Manikandan K Pillai <mpillai@cadence.com> + */ +#include <linux/kernel.h> +#include <linux/of.h> + +#include "pcie-cadence.h" + +bool cdns_pcie_hpa_link_up(struct cdns_pcie *pcie) +{ + u32 pl_reg_val; + + pl_reg_val = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_PHY_DBG_STS_REG0); + if (pl_reg_val & GENMASK(0, 0)) + return true; + return false; +} +EXPORT_SYMBOL_GPL(cdns_pcie_hpa_link_up); + +void cdns_pcie_hpa_detect_quiet_min_delay_set(struct cdns_pcie *pcie) +{ + u32 delay = 0x3; + u32 ltssm_control_cap; + + /* Set the LTSSM Detect Quiet state min. 
delay to 2ms */ + ltssm_control_cap = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG, + CDNS_PCIE_HPA_PHY_LAYER_CFG0); + ltssm_control_cap = ((ltssm_control_cap & + ~CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK) | + CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY(delay)); + + cdns_pcie_hpa_writel(pcie, REG_BANK_IP_REG, + CDNS_PCIE_HPA_PHY_LAYER_CFG0, ltssm_control_cap); +} +EXPORT_SYMBOL_GPL(cdns_pcie_hpa_detect_quiet_min_delay_set); + +void cdns_pcie_hpa_set_outbound_region(struct cdns_pcie *pcie, u8 busnr, u8 fn, + u32 r, bool is_io, + u64 cpu_addr, u64 pci_addr, size_t size) +{ + /* + * roundup_pow_of_two() returns an unsigned long, which is not suited + * for 64bit values + */ + u64 sz = 1ULL << fls64(size - 1); + int nbits = ilog2(sz); + u32 addr0, addr1, desc0, desc1, ctrl0; + + if (nbits < 8) + nbits = 8; + + /* Set the PCI address */ + addr0 = CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) | + (lower_32_bits(pci_addr) & GENMASK(31, 8)); + addr1 = upper_32_bits(pci_addr); + + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r), addr0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r), addr1); + + /* Set the PCIe header descriptor */ + if (is_io) + desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_IO; + else + desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MEM; + desc1 = 0; + ctrl0 = 0; + + /* + * Whether Bit [26] is set or not inside DESC0 register of the outbound + * PCIe descriptor, the PCI function number must be set into + * Bits [31:24] of DESC1 anyway. + * + * In Root Complex mode, the function number is always 0 but in Endpoint + * mode, the PCIe controller may support more than one function. This + * function number needs to be set properly into the outbound PCIe + * descriptor. + * + * Besides, setting Bit [26] is mandatory when in Root Complex mode: + * then the driver must provide the bus, resp. device, number in + * Bits [31:24] of DESC1, resp. Bits[23:16] of DESC0. Like the function + * number, the device number is always 0 in Root Complex mode. + * + * However when in Endpoint mode, we can clear Bit [26] of DESC0, hence + * the PCIe controller will use the captured values for the bus and + * device numbers. 
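+	 *
+	 * For example, in RC mode with busnr 0x01 the code below programs
+	 * DESC1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(0x01) |
+	 * CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0), i.e. 0x01000000, and sets
+	 * both SUPPLY_BUS and SUPPLY_DEV_FN in CTRL0.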
+ */ + if (pcie->is_rc) { + /* The device and function numbers are always 0 */ + desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr) | + CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0); + ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS | + CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN; + } else { + /* + * Use captured values for bus and device numbers but still + * need to set the function number + */ + desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(fn); + } + + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r), desc0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r), desc1); + + addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) | + (lower_32_bits(cpu_addr) & GENMASK(31, 8)); + addr1 = upper_32_bits(cpu_addr); + + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r), addr0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r), addr1); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r), ctrl0); +} +EXPORT_SYMBOL_GPL(cdns_pcie_hpa_set_outbound_region); + +void cdns_pcie_hpa_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie, + u8 busnr, u8 fn, + u32 r, u64 cpu_addr) +{ + u32 addr0, addr1, desc0, desc1, ctrl0; + + desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG; + desc1 = 0; + ctrl0 = 0; + + /* See cdns_pcie_set_outbound_region() comments above */ + if (pcie->is_rc) { + desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr) | + CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0); + ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS | + CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN; + } else { + desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(fn); + } + + addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(17) | + (lower_32_bits(cpu_addr) & GENMASK(31, 8)); + addr1 = upper_32_bits(cpu_addr); + + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r), 0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r), 0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r), desc0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r), desc1); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r), addr0); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r), addr1); + cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, + CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r), ctrl0); +} +EXPORT_SYMBOL_GPL(cdns_pcie_hpa_set_outbound_region_for_normal_msg); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Cadence PCIe controller driver"); diff --git a/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h b/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h new file mode 100644 index 000000000000..857b2140c5d2 --- /dev/null +++ b/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Cadence PCIe controller driver. 
+ * + * Copyright (c) 2017 Cadence + * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com> + */ +#ifndef _PCIE_CADENCE_LGA_REGS_H +#define _PCIE_CADENCE_LGA_REGS_H + +#include <linux/bitfield.h> + +/* Parameters for the waiting for link up routine */ +#define LINK_WAIT_MAX_RETRIES 10 +#define LINK_WAIT_USLEEP_MIN 90000 +#define LINK_WAIT_USLEEP_MAX 100000 + +/* Local Management Registers */ +#define CDNS_PCIE_LM_BASE 0x00100000 + +/* Vendor ID Register */ +#define CDNS_PCIE_LM_ID (CDNS_PCIE_LM_BASE + 0x0044) +#define CDNS_PCIE_LM_ID_VENDOR_MASK GENMASK(15, 0) +#define CDNS_PCIE_LM_ID_VENDOR_SHIFT 0 +#define CDNS_PCIE_LM_ID_VENDOR(vid) \ + (((vid) << CDNS_PCIE_LM_ID_VENDOR_SHIFT) & CDNS_PCIE_LM_ID_VENDOR_MASK) +#define CDNS_PCIE_LM_ID_SUBSYS_MASK GENMASK(31, 16) +#define CDNS_PCIE_LM_ID_SUBSYS_SHIFT 16 +#define CDNS_PCIE_LM_ID_SUBSYS(sub) \ + (((sub) << CDNS_PCIE_LM_ID_SUBSYS_SHIFT) & CDNS_PCIE_LM_ID_SUBSYS_MASK) + +/* Root Port Requester ID Register */ +#define CDNS_PCIE_LM_RP_RID (CDNS_PCIE_LM_BASE + 0x0228) +#define CDNS_PCIE_LM_RP_RID_MASK GENMASK(15, 0) +#define CDNS_PCIE_LM_RP_RID_SHIFT 0 +#define CDNS_PCIE_LM_RP_RID_(rid) \ + (((rid) << CDNS_PCIE_LM_RP_RID_SHIFT) & CDNS_PCIE_LM_RP_RID_MASK) + +/* Endpoint Bus and Device Number Register */ +#define CDNS_PCIE_LM_EP_ID (CDNS_PCIE_LM_BASE + 0x022C) +#define CDNS_PCIE_LM_EP_ID_DEV_MASK GENMASK(4, 0) +#define CDNS_PCIE_LM_EP_ID_DEV_SHIFT 0 +#define CDNS_PCIE_LM_EP_ID_BUS_MASK GENMASK(15, 8) +#define CDNS_PCIE_LM_EP_ID_BUS_SHIFT 8 + +/* Endpoint Function f BAR b Configuration Registers */ +#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG(bar, fn) \ + (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn)) +#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) \ + (CDNS_PCIE_LM_BASE + 0x0240 + (fn) * 0x0008) +#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn) \ + (CDNS_PCIE_LM_BASE + 0x0244 + (fn) * 0x0008) +#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG(bar, fn) \ + (((bar) < BAR_4) ? 
CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn)) +#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) \ + (CDNS_PCIE_LM_BASE + 0x0280 + (fn) * 0x0008) +#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn) \ + (CDNS_PCIE_LM_BASE + 0x0284 + (fn) * 0x0008) +#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b) \ + (GENMASK(4, 0) << ((b) * 8)) +#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \ + (((a) << ((b) * 8)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b)) +#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b) \ + (GENMASK(7, 5) << ((b) * 8)) +#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \ + (((c) << ((b) * 8 + 5)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b)) + +/* Endpoint Function Configuration Register */ +#define CDNS_PCIE_LM_EP_FUNC_CFG (CDNS_PCIE_LM_BASE + 0x02C0) + +/* Root Complex BAR Configuration Register */ +#define CDNS_PCIE_LM_RC_BAR_CFG (CDNS_PCIE_LM_BASE + 0x0300) +#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(5, 0) +#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE(a) \ + (((a) << 0) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK) +#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(8, 6) +#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL(c) \ + (((c) << 6) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK) +#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(13, 9) +#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE(a) \ + (((a) << 9) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK) +#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(16, 14) +#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL(c) \ + (((c) << 14) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK) +#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(17) +#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_32BITS 0 +#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(18) +#define CDNS_PCIE_LM_RC_BAR_CFG_IO_ENABLE BIT(19) +#define CDNS_PCIE_LM_RC_BAR_CFG_IO_16BITS 0 +#define CDNS_PCIE_LM_RC_BAR_CFG_IO_32BITS BIT(20) +#define CDNS_PCIE_LM_RC_BAR_CFG_CHECK_ENABLE BIT(31) + +/* BAR control values applicable to both Endpoint Function and Root Complex */ +#define CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED 0x0 +#define CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS 0x1 +#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS 0x4 +#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x5 +#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS 0x6 +#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0x7 + +#define LM_RC_BAR_CFG_CTRL_DISABLED(bar) \ + (CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED << (((bar) * 8) + 6)) +#define LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \ + (CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS << (((bar) * 8) + 6)) +#define LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \ + (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS << (((bar) * 8) + 6)) +#define LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \ + (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << (((bar) * 8) + 6)) +#define LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \ + (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS << (((bar) * 8) + 6)) +#define LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \ + (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << (((bar) * 8) + 6)) +#define LM_RC_BAR_CFG_APERTURE(bar, aperture) \ + (((aperture) - 2) << ((bar) * 8)) + +/* PTM Control Register */ +#define CDNS_PCIE_LM_PTM_CTRL (CDNS_PCIE_LM_BASE + 0x0DA8) +#define CDNS_PCIE_LM_TPM_CTRL_PTMRSEN BIT(17) + +/* + * Endpoint Function Registers (PCI configuration space for endpoint functions) + */ +#define CDNS_PCIE_EP_FUNC_BASE(fn) (((fn) << 12) & GENMASK(19, 12)) + +#define CDNS_PCIE_EP_FUNC_MSI_CAP_OFFSET 0x90 +#define CDNS_PCIE_EP_FUNC_MSIX_CAP_OFFSET 0xB0 +#define 
CDNS_PCIE_EP_FUNC_DEV_CAP_OFFSET 0xC0 +#define CDNS_PCIE_EP_FUNC_SRIOV_CAP_OFFSET 0x200 + +/* Endpoint PF Registers */ +#define CDNS_PCIE_CORE_PF_I_ARI_CAP_AND_CTRL(fn) (0x144 + (fn) * 0x1000) +#define CDNS_PCIE_ARI_CAP_NFN_MASK GENMASK(15, 8) + +/* Root Port Registers (PCI configuration space for the root port function) */ +#define CDNS_PCIE_RP_BASE 0x00200000 +#define CDNS_PCIE_RP_CAP_OFFSET 0xC0 + +/* Address Translation Registers */ +#define CDNS_PCIE_AT_BASE 0x00400000 + +/* Region r Outbound AXI to PCIe Address Translation Register 0 */ +#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0(r) \ + (CDNS_PCIE_AT_BASE + 0x0000 + ((r) & 0x1F) * 0x0020) +#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0) +#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \ + (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK) +#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(19, 12) +#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \ + (((devfn) << 12) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK) +#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(27, 20) +#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS(bus) \ + (((bus) << 20) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK) + +/* Region r Outbound AXI to PCIe Address Translation Register 1 */ +#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR1(r) \ + (CDNS_PCIE_AT_BASE + 0x0004 + ((r) & 0x1F) * 0x0020) + +/* Region r Outbound PCIe Descriptor Register 0 */ +#define CDNS_PCIE_AT_OB_REGION_DESC0(r) \ + (CDNS_PCIE_AT_BASE + 0x0008 + ((r) & 0x1F) * 0x0020) +#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(3, 0) +#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MEM 0x2 +#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_IO 0x6 +#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 0xA +#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 0xB +#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG 0xC +#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_VENDOR_MSG 0xD +/* Bit 23 MUST be set in RC mode. 
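+ * When it is, the Requester ID of outbound requests is taken from the bus
+ * and device numbers programmed in DESC1/DESC0 rather than from the
+ * captured values.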
*/ +#define CDNS_PCIE_AT_OB_REGION_DESC0_HARDCODED_RID BIT(23) +#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK GENMASK(31, 24) +#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN(devfn) \ + (((devfn) << 24) & CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK) + +/* Region r Outbound PCIe Descriptor Register 1 */ +#define CDNS_PCIE_AT_OB_REGION_DESC1(r) \ + (CDNS_PCIE_AT_BASE + 0x000C + ((r) & 0x1F) * 0x0020) +#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK GENMASK(7, 0) +#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS(bus) \ + ((bus) & CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK) + +/* Region r AXI Region Base Address Register 0 */ +#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0(r) \ + (CDNS_PCIE_AT_BASE + 0x0018 + ((r) & 0x1F) * 0x0020) +#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0) +#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \ + (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK) + +/* Region r AXI Region Base Address Register 1 */ +#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR1(r) \ + (CDNS_PCIE_AT_BASE + 0x001C + ((r) & 0x1F) * 0x0020) + +/* Root Port BAR Inbound PCIe to AXI Address Translation Register */ +#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0(bar) \ + (CDNS_PCIE_AT_BASE + 0x0800 + (bar) * 0x0008) +#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0) +#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \ + (((nbits) - 1) & CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK) +#define CDNS_PCIE_AT_IB_RP_BAR_ADDR1(bar) \ + (CDNS_PCIE_AT_BASE + 0x0804 + (bar) * 0x0008) + +/* AXI link down register */ +#define CDNS_PCIE_AT_LINKDOWN (CDNS_PCIE_AT_BASE + 0x0824) + +/* LTSSM Capabilities register */ +#define CDNS_PCIE_LTSSM_CONTROL_CAP (CDNS_PCIE_LM_BASE + 0x0054) +#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK GENMASK(2, 1) +#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT 1 +#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY(delay) \ + (((delay) << CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT) & \ + CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK) + +#define CDNS_PCIE_RP_MAX_IB 0x3 +#define CDNS_PCIE_MAX_OB 32 + +/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */ +#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) \ + (CDNS_PCIE_AT_BASE + 0x0840 + (fn) * 0x0040 + (bar) * 0x0008) +#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) \ + (CDNS_PCIE_AT_BASE + 0x0844 + (fn) * 0x0040 + (bar) * 0x0008) + +/* Normal/Vendor specific message access: offset inside some outbound region */ +#define CDNS_PCIE_NORMAL_MSG_ROUTING_MASK GENMASK(7, 5) +#define CDNS_PCIE_NORMAL_MSG_ROUTING(route) \ + (((route) << 5) & CDNS_PCIE_NORMAL_MSG_ROUTING_MASK) +#define CDNS_PCIE_NORMAL_MSG_CODE_MASK GENMASK(15, 8) +#define CDNS_PCIE_NORMAL_MSG_CODE(code) \ + (((code) << 8) & CDNS_PCIE_NORMAL_MSG_CODE_MASK) +#define CDNS_PCIE_MSG_NO_DATA BIT(16) + +#endif /* _PCIE_CADENCE_LGA_REGS_H */ diff --git a/drivers/pci/controller/cadence/pcie-cadence-plat.c b/drivers/pci/controller/cadence/pcie-cadence-plat.c index 0456845dabb9..b067a3296dd3 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-plat.c +++ b/drivers/pci/controller/cadence/pcie-cadence-plat.c @@ -22,10 +22,6 @@ struct cdns_plat_pcie { struct cdns_pcie *pcie; }; -struct cdns_plat_pcie_of_data { - bool is_rc; -}; - static const struct of_device_id cdns_plat_pcie_of_match[]; static u64 cdns_plat_cpu_addr_fixup(struct cdns_pcie *pcie, u64 cpu_addr) @@ -177,4 +173,7 @@ static struct platform_driver cdns_plat_pcie_driver = { .probe = cdns_plat_pcie_probe, .shutdown = cdns_plat_pcie_shutdown, }; -builtin_platform_driver(cdns_plat_pcie_driver); 
+module_platform_driver(cdns_plat_pcie_driver); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Cadence PCIe controller platform driver"); diff --git a/drivers/pci/controller/cadence/pcie-cadence.c b/drivers/pci/controller/cadence/pcie-cadence.c index bd683d0fecb2..e6f1a4ac0fb7 100644 --- a/drivers/pci/controller/cadence/pcie-cadence.c +++ b/drivers/pci/controller/cadence/pcie-cadence.c @@ -23,6 +23,17 @@ u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap) } EXPORT_SYMBOL_GPL(cdns_pcie_find_ext_capability); +bool cdns_pcie_linkup(struct cdns_pcie *pcie) +{ + u32 pl_reg_val; + + pl_reg_val = cdns_pcie_readl(pcie, CDNS_PCIE_LM_BASE); + if (pl_reg_val & GENMASK(0, 0)) + return true; + return false; +} +EXPORT_SYMBOL_GPL(cdns_pcie_linkup); + void cdns_pcie_detect_quiet_min_delay_set(struct cdns_pcie *pcie) { u32 delay = 0x3; @@ -293,6 +304,7 @@ const struct dev_pm_ops cdns_pcie_pm_ops = { NOIRQ_SYSTEM_SLEEP_PM_OPS(cdns_pcie_suspend_noirq, cdns_pcie_resume_noirq) }; +EXPORT_SYMBOL_GPL(cdns_pcie_pm_ops); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Cadence PCIe controller driver"); diff --git a/drivers/pci/controller/cadence/pcie-cadence.h b/drivers/pci/controller/cadence/pcie-cadence.h index e2a853d2c0ab..443033c607d7 100644 --- a/drivers/pci/controller/cadence/pcie-cadence.h +++ b/drivers/pci/controller/cadence/pcie-cadence.h @@ -7,211 +7,12 @@ #define _PCIE_CADENCE_H #include <linux/kernel.h> +#include <linux/module.h> #include <linux/pci.h> #include <linux/pci-epf.h> #include <linux/phy/phy.h> - -/* Parameters for the waiting for link up routine */ -#define LINK_WAIT_MAX_RETRIES 10 -#define LINK_WAIT_USLEEP_MIN 90000 -#define LINK_WAIT_USLEEP_MAX 100000 - -/* - * Local Management Registers - */ -#define CDNS_PCIE_LM_BASE 0x00100000 - -/* Vendor ID Register */ -#define CDNS_PCIE_LM_ID (CDNS_PCIE_LM_BASE + 0x0044) -#define CDNS_PCIE_LM_ID_VENDOR_MASK GENMASK(15, 0) -#define CDNS_PCIE_LM_ID_VENDOR_SHIFT 0 -#define CDNS_PCIE_LM_ID_VENDOR(vid) \ - (((vid) << CDNS_PCIE_LM_ID_VENDOR_SHIFT) & CDNS_PCIE_LM_ID_VENDOR_MASK) -#define CDNS_PCIE_LM_ID_SUBSYS_MASK GENMASK(31, 16) -#define CDNS_PCIE_LM_ID_SUBSYS_SHIFT 16 -#define CDNS_PCIE_LM_ID_SUBSYS(sub) \ - (((sub) << CDNS_PCIE_LM_ID_SUBSYS_SHIFT) & CDNS_PCIE_LM_ID_SUBSYS_MASK) - -/* Root Port Requester ID Register */ -#define CDNS_PCIE_LM_RP_RID (CDNS_PCIE_LM_BASE + 0x0228) -#define CDNS_PCIE_LM_RP_RID_MASK GENMASK(15, 0) -#define CDNS_PCIE_LM_RP_RID_SHIFT 0 -#define CDNS_PCIE_LM_RP_RID_(rid) \ - (((rid) << CDNS_PCIE_LM_RP_RID_SHIFT) & CDNS_PCIE_LM_RP_RID_MASK) - -/* Endpoint Bus and Device Number Register */ -#define CDNS_PCIE_LM_EP_ID (CDNS_PCIE_LM_BASE + 0x022c) -#define CDNS_PCIE_LM_EP_ID_DEV_MASK GENMASK(4, 0) -#define CDNS_PCIE_LM_EP_ID_DEV_SHIFT 0 -#define CDNS_PCIE_LM_EP_ID_BUS_MASK GENMASK(15, 8) -#define CDNS_PCIE_LM_EP_ID_BUS_SHIFT 8 - -/* Endpoint Function f BAR b Configuration Registers */ -#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG(bar, fn) \ - (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn)) -#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) \ - (CDNS_PCIE_LM_BASE + 0x0240 + (fn) * 0x0008) -#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn) \ - (CDNS_PCIE_LM_BASE + 0x0244 + (fn) * 0x0008) -#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG(bar, fn) \ - (((bar) < BAR_4) ? 
CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn)) -#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) \ - (CDNS_PCIE_LM_BASE + 0x0280 + (fn) * 0x0008) -#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn) \ - (CDNS_PCIE_LM_BASE + 0x0284 + (fn) * 0x0008) -#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b) \ - (GENMASK(4, 0) << ((b) * 8)) -#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \ - (((a) << ((b) * 8)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b)) -#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b) \ - (GENMASK(7, 5) << ((b) * 8)) -#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \ - (((c) << ((b) * 8 + 5)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b)) - -/* Endpoint Function Configuration Register */ -#define CDNS_PCIE_LM_EP_FUNC_CFG (CDNS_PCIE_LM_BASE + 0x02c0) - -/* Root Complex BAR Configuration Register */ -#define CDNS_PCIE_LM_RC_BAR_CFG (CDNS_PCIE_LM_BASE + 0x0300) -#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(5, 0) -#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE(a) \ - (((a) << 0) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK) -#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(8, 6) -#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL(c) \ - (((c) << 6) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK) -#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(13, 9) -#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE(a) \ - (((a) << 9) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK) -#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(16, 14) -#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL(c) \ - (((c) << 14) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK) -#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(17) -#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_32BITS 0 -#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(18) -#define CDNS_PCIE_LM_RC_BAR_CFG_IO_ENABLE BIT(19) -#define CDNS_PCIE_LM_RC_BAR_CFG_IO_16BITS 0 -#define CDNS_PCIE_LM_RC_BAR_CFG_IO_32BITS BIT(20) -#define CDNS_PCIE_LM_RC_BAR_CFG_CHECK_ENABLE BIT(31) - -/* BAR control values applicable to both Endpoint Function and Root Complex */ -#define CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED 0x0 -#define CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS 0x1 -#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS 0x4 -#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x5 -#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS 0x6 -#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0x7 - -#define LM_RC_BAR_CFG_CTRL_DISABLED(bar) \ - (CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED << (((bar) * 8) + 6)) -#define LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \ - (CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS << (((bar) * 8) + 6)) -#define LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \ - (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS << (((bar) * 8) + 6)) -#define LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \ - (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << (((bar) * 8) + 6)) -#define LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \ - (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS << (((bar) * 8) + 6)) -#define LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \ - (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << (((bar) * 8) + 6)) -#define LM_RC_BAR_CFG_APERTURE(bar, aperture) \ - (((aperture) - 2) << ((bar) * 8)) - -/* PTM Control Register */ -#define CDNS_PCIE_LM_PTM_CTRL (CDNS_PCIE_LM_BASE + 0x0da8) -#define CDNS_PCIE_LM_TPM_CTRL_PTMRSEN BIT(17) - -/* - * Endpoint Function Registers (PCI configuration space for endpoint functions) - */ -#define CDNS_PCIE_EP_FUNC_BASE(fn) (((fn) << 12) & GENMASK(19, 12)) - -/* - * Endpoint PF Registers - */ -#define CDNS_PCIE_CORE_PF_I_ARI_CAP_AND_CTRL(fn) (0x144 + (fn) * 
0x1000) -#define CDNS_PCIE_ARI_CAP_NFN_MASK GENMASK(15, 8) - -/* - * Root Port Registers (PCI configuration space for the root port function) - */ -#define CDNS_PCIE_RP_BASE 0x00200000 -#define CDNS_PCIE_RP_CAP_OFFSET 0xc0 - -/* - * Address Translation Registers - */ -#define CDNS_PCIE_AT_BASE 0x00400000 - -/* Region r Outbound AXI to PCIe Address Translation Register 0 */ -#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0(r) \ - (CDNS_PCIE_AT_BASE + 0x0000 + ((r) & 0x1f) * 0x0020) -#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0) -#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \ - (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK) -#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(19, 12) -#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \ - (((devfn) << 12) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK) -#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(27, 20) -#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS(bus) \ - (((bus) << 20) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK) - -/* Region r Outbound AXI to PCIe Address Translation Register 1 */ -#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR1(r) \ - (CDNS_PCIE_AT_BASE + 0x0004 + ((r) & 0x1f) * 0x0020) - -/* Region r Outbound PCIe Descriptor Register 0 */ -#define CDNS_PCIE_AT_OB_REGION_DESC0(r) \ - (CDNS_PCIE_AT_BASE + 0x0008 + ((r) & 0x1f) * 0x0020) -#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(3, 0) -#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MEM 0x2 -#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_IO 0x6 -#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 0xa -#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 0xb -#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG 0xc -#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_VENDOR_MSG 0xd -/* Bit 23 MUST be set in RC mode. 
*/ -#define CDNS_PCIE_AT_OB_REGION_DESC0_HARDCODED_RID BIT(23) -#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK GENMASK(31, 24) -#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN(devfn) \ - (((devfn) << 24) & CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK) - -/* Region r Outbound PCIe Descriptor Register 1 */ -#define CDNS_PCIE_AT_OB_REGION_DESC1(r) \ - (CDNS_PCIE_AT_BASE + 0x000c + ((r) & 0x1f) * 0x0020) -#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK GENMASK(7, 0) -#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS(bus) \ - ((bus) & CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK) - -/* Region r AXI Region Base Address Register 0 */ -#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0(r) \ - (CDNS_PCIE_AT_BASE + 0x0018 + ((r) & 0x1f) * 0x0020) -#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0) -#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \ - (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK) - -/* Region r AXI Region Base Address Register 1 */ -#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR1(r) \ - (CDNS_PCIE_AT_BASE + 0x001c + ((r) & 0x1f) * 0x0020) - -/* Root Port BAR Inbound PCIe to AXI Address Translation Register */ -#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0(bar) \ - (CDNS_PCIE_AT_BASE + 0x0800 + (bar) * 0x0008) -#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0) -#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \ - (((nbits) - 1) & CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK) -#define CDNS_PCIE_AT_IB_RP_BAR_ADDR1(bar) \ - (CDNS_PCIE_AT_BASE + 0x0804 + (bar) * 0x0008) - -/* AXI link down register */ -#define CDNS_PCIE_AT_LINKDOWN (CDNS_PCIE_AT_BASE + 0x0824) - -/* LTSSM Capabilities register */ -#define CDNS_PCIE_LTSSM_CONTROL_CAP (CDNS_PCIE_LM_BASE + 0x0054) -#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK GENMASK(2, 1) -#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT 1 -#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY(delay) \ - (((delay) << CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT) & \ - CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK) +#include "pcie-cadence-lga-regs.h" +#include "pcie-cadence-hpa-regs.h" enum cdns_pcie_rp_bar { RP_BAR_UNDEFINED = -1, @@ -220,42 +21,63 @@ enum cdns_pcie_rp_bar { RP_NO_BAR }; -#define CDNS_PCIE_RP_MAX_IB 0x3 -#define CDNS_PCIE_MAX_OB 32 - struct cdns_pcie_rp_ib_bar { u64 size; bool free; }; -/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */ -#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) \ - (CDNS_PCIE_AT_BASE + 0x0840 + (fn) * 0x0040 + (bar) * 0x0008) -#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) \ - (CDNS_PCIE_AT_BASE + 0x0844 + (fn) * 0x0040 + (bar) * 0x0008) - -/* Normal/Vendor specific message access: offset inside some outbound region */ -#define CDNS_PCIE_NORMAL_MSG_ROUTING_MASK GENMASK(7, 5) -#define CDNS_PCIE_NORMAL_MSG_ROUTING(route) \ - (((route) << 5) & CDNS_PCIE_NORMAL_MSG_ROUTING_MASK) -#define CDNS_PCIE_NORMAL_MSG_CODE_MASK GENMASK(15, 8) -#define CDNS_PCIE_NORMAL_MSG_CODE(code) \ - (((code) << 8) & CDNS_PCIE_NORMAL_MSG_CODE_MASK) -#define CDNS_PCIE_MSG_DATA BIT(16) - struct cdns_pcie; +struct cdns_pcie_rc; + +enum cdns_pcie_reg_bank { + REG_BANK_RP, + REG_BANK_IP_REG, + REG_BANK_IP_CFG_CTRL_REG, + REG_BANK_AXI_MASTER_COMMON, + REG_BANK_AXI_MASTER, + REG_BANK_AXI_SLAVE, + REG_BANK_AXI_HLS, + REG_BANK_AXI_RAS, + REG_BANK_AXI_DTI, + REG_BANKS_MAX, +}; struct cdns_pcie_ops { - int (*start_link)(struct cdns_pcie *pcie); - void (*stop_link)(struct cdns_pcie *pcie); - bool (*link_up)(struct cdns_pcie *pcie); + int (*start_link)(struct cdns_pcie *pcie); + void (*stop_link)(struct cdns_pcie *pcie); + bool (*link_up)(struct cdns_pcie 
*pcie); u64 (*cpu_addr_fixup)(struct cdns_pcie *pcie, u64 cpu_addr); }; /** + * struct cdns_plat_pcie_of_data - Register bank offset for a platform + * @is_rc: controller is a RC + * @ip_reg_bank_offset: ip register bank start offset + * @ip_cfg_ctrl_reg_offset: ip config control register start offset + * @axi_mstr_common_offset: AXI master common register start offset + * @axi_slave_offset: AXI slave start offset + * @axi_master_offset: AXI master start offset + * @axi_hls_offset: AXI HLS offset start + * @axi_ras_offset: AXI RAS offset + * @axi_dti_offset: AXI DTI offset + */ +struct cdns_plat_pcie_of_data { + u32 is_rc:1; + u32 ip_reg_bank_offset; + u32 ip_cfg_ctrl_reg_offset; + u32 axi_mstr_common_offset; + u32 axi_slave_offset; + u32 axi_master_offset; + u32 axi_hls_offset; + u32 axi_ras_offset; + u32 axi_dti_offset; +}; + +/** * struct cdns_pcie - private data for Cadence PCIe controller drivers * @reg_base: IO mapped register base * @mem_res: start/end offsets in the physical system memory to map PCI accesses + * @msg_res: Region for send message to map PCI accesses * @dev: PCIe controller * @is_rc: tell whether the PCIe controller mode is Root Complex or Endpoint. * @phy_count: number of supported PHY devices @@ -263,16 +85,19 @@ struct cdns_pcie_ops { * @link: list of pointers to corresponding device link representations * @ops: Platform-specific ops to control various inputs from Cadence PCIe * wrapper + * @cdns_pcie_reg_offsets: Register bank offsets for different SoC */ struct cdns_pcie { - void __iomem *reg_base; - struct resource *mem_res; - struct device *dev; - bool is_rc; - int phy_count; - struct phy **phy; - struct device_link **link; - const struct cdns_pcie_ops *ops; + void __iomem *reg_base; + struct resource *mem_res; + struct resource *msg_res; + struct device *dev; + bool is_rc; + int phy_count; + struct phy **phy; + struct device_link **link; + const struct cdns_pcie_ops *ops; + const struct cdns_plat_pcie_of_data *cdns_pcie_reg_offsets; }; /** @@ -288,6 +113,8 @@ struct cdns_pcie { * available * @quirk_retrain_flag: Retrain link as quirk for PCIe Gen2 * @quirk_detect_quiet_flag: LTSSM Detect Quiet min delay set as quirk + * @ecam_supported: Whether the ECAM is supported + * @no_inbound_map: Whether inbound mapping is supported */ struct cdns_pcie_rc { struct cdns_pcie pcie; @@ -298,6 +125,8 @@ struct cdns_pcie_rc { bool avail_ib_bar[CDNS_PCIE_RP_MAX_IB]; unsigned int quirk_retrain_flag:1; unsigned int quirk_detect_quiet_flag:1; + unsigned int ecam_supported:1; + unsigned int no_inbound_map:1; }; /** @@ -350,6 +179,43 @@ struct cdns_pcie_ep { unsigned int quirk_disable_flr:1; }; +static inline u32 cdns_reg_bank_to_off(struct cdns_pcie *pcie, enum cdns_pcie_reg_bank bank) +{ + u32 offset = 0x0; + + switch (bank) { + case REG_BANK_RP: + offset = 0; + break; + case REG_BANK_IP_REG: + offset = pcie->cdns_pcie_reg_offsets->ip_reg_bank_offset; + break; + case REG_BANK_IP_CFG_CTRL_REG: + offset = pcie->cdns_pcie_reg_offsets->ip_cfg_ctrl_reg_offset; + break; + case REG_BANK_AXI_MASTER_COMMON: + offset = pcie->cdns_pcie_reg_offsets->axi_mstr_common_offset; + break; + case REG_BANK_AXI_MASTER: + offset = pcie->cdns_pcie_reg_offsets->axi_master_offset; + break; + case REG_BANK_AXI_SLAVE: + offset = pcie->cdns_pcie_reg_offsets->axi_slave_offset; + break; + case REG_BANK_AXI_HLS: + offset = pcie->cdns_pcie_reg_offsets->axi_hls_offset; + break; + case REG_BANK_AXI_RAS: + offset = pcie->cdns_pcie_reg_offsets->axi_ras_offset; + break; + case REG_BANK_AXI_DTI: + offset = 
pcie->cdns_pcie_reg_offsets->axi_dti_offset; + break; + default: + break; + } + return offset; +} /* Register access */ static inline void cdns_pcie_writel(struct cdns_pcie *pcie, u32 reg, u32 value) @@ -362,6 +228,27 @@ static inline u32 cdns_pcie_readl(struct cdns_pcie *pcie, u32 reg) return readl(pcie->reg_base + reg); } +static inline void cdns_pcie_hpa_writel(struct cdns_pcie *pcie, + enum cdns_pcie_reg_bank bank, + u32 reg, + u32 value) +{ + u32 offset = cdns_reg_bank_to_off(pcie, bank); + + reg += offset; + writel(value, pcie->reg_base + reg); +} + +static inline u32 cdns_pcie_hpa_readl(struct cdns_pcie *pcie, + enum cdns_pcie_reg_bank bank, + u32 reg) +{ + u32 offset = cdns_reg_bank_to_off(pcie, bank); + + reg += offset; + return readl(pcie->reg_base + reg); +} + static inline u16 cdns_pcie_readw(struct cdns_pcie *pcie, u32 reg) { return readw(pcie->reg_base + reg); @@ -457,6 +344,29 @@ static inline u16 cdns_pcie_rp_readw(struct cdns_pcie *pcie, u32 reg) return cdns_pcie_read_sz(addr, 0x2); } +static inline void cdns_pcie_hpa_rp_writeb(struct cdns_pcie *pcie, + u32 reg, u8 value) +{ + void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg; + + cdns_pcie_write_sz(addr, 0x1, value); +} + +static inline void cdns_pcie_hpa_rp_writew(struct cdns_pcie *pcie, + u32 reg, u16 value) +{ + void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg; + + cdns_pcie_write_sz(addr, 0x2, value); +} + +static inline u16 cdns_pcie_hpa_rp_readw(struct cdns_pcie *pcie, u32 reg) +{ + void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg; + + return cdns_pcie_read_sz(addr, 0x2); +} + /* Endpoint Function register access */ static inline void cdns_pcie_ep_fn_writeb(struct cdns_pcie *pcie, u8 fn, u32 reg, u8 value) @@ -521,6 +431,7 @@ int cdns_pcie_host_setup(struct cdns_pcie_rc *rc); void cdns_pcie_host_disable(struct cdns_pcie_rc *rc); void __iomem *cdns_pci_map_bus(struct pci_bus *bus, unsigned int devfn, int where); +int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc); #else static inline int cdns_pcie_host_link_setup(struct cdns_pcie_rc *rc) { @@ -537,6 +448,11 @@ static inline int cdns_pcie_host_setup(struct cdns_pcie_rc *rc) return 0; } +static inline int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc) +{ + return 0; +} + static inline void cdns_pcie_host_disable(struct cdns_pcie_rc *rc) { } @@ -551,6 +467,7 @@ static inline void __iomem *cdns_pci_map_bus(struct pci_bus *bus, unsigned int d #if IS_ENABLED(CONFIG_PCIE_CADENCE_EP) int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep); void cdns_pcie_ep_disable(struct cdns_pcie_ep *ep); +int cdns_pcie_hpa_ep_setup(struct cdns_pcie_ep *ep); #else static inline int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep) { @@ -560,10 +477,17 @@ static inline int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep) static inline void cdns_pcie_ep_disable(struct cdns_pcie_ep *ep) { } + +static inline int cdns_pcie_hpa_ep_setup(struct cdns_pcie_ep *ep) +{ + return 0; +} + #endif -u8 cdns_pcie_find_capability(struct cdns_pcie *pcie, u8 cap); -u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap); +u8 cdns_pcie_find_capability(struct cdns_pcie *pcie, u8 cap); +u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap); +bool cdns_pcie_linkup(struct cdns_pcie *pcie); void cdns_pcie_detect_quiet_min_delay_set(struct cdns_pcie *pcie); @@ -577,8 +501,23 @@ void cdns_pcie_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie, void cdns_pcie_reset_outbound_region(struct cdns_pcie *pcie, u32 r); void cdns_pcie_disable_phy(struct 
cdns_pcie *pcie); -int cdns_pcie_enable_phy(struct cdns_pcie *pcie); -int cdns_pcie_init_phy(struct device *dev, struct cdns_pcie *pcie); +int cdns_pcie_enable_phy(struct cdns_pcie *pcie); +int cdns_pcie_init_phy(struct device *dev, struct cdns_pcie *pcie); +void cdns_pcie_hpa_detect_quiet_min_delay_set(struct cdns_pcie *pcie); +void cdns_pcie_hpa_set_outbound_region(struct cdns_pcie *pcie, u8 busnr, u8 fn, + u32 r, bool is_io, + u64 cpu_addr, u64 pci_addr, size_t size); +void cdns_pcie_hpa_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie, + u8 busnr, u8 fn, + u32 r, u64 cpu_addr); +int cdns_pcie_hpa_host_link_setup(struct cdns_pcie_rc *rc); +void __iomem *cdns_pci_hpa_map_bus(struct pci_bus *bus, unsigned int devfn, + int where); +int cdns_pcie_hpa_host_start_link(struct cdns_pcie_rc *rc); +int cdns_pcie_hpa_start_link(struct cdns_pcie *pcie); +void cdns_pcie_hpa_stop_link(struct cdns_pcie *pcie); +bool cdns_pcie_hpa_link_up(struct cdns_pcie *pcie); + extern const struct dev_pm_ops cdns_pcie_pm_ops; #endif /* _PCIE_CADENCE_H */ diff --git a/drivers/pci/controller/cadence/pcie-sg2042.c b/drivers/pci/controller/cadence/pcie-sg2042.c index a077b28d4894..0c50c74d03ee 100644 --- a/drivers/pci/controller/cadence/pcie-sg2042.c +++ b/drivers/pci/controller/cadence/pcie-sg2042.c @@ -74,15 +74,12 @@ static int sg2042_pcie_probe(struct platform_device *pdev) static void sg2042_pcie_remove(struct platform_device *pdev) { struct cdns_pcie *pcie = platform_get_drvdata(pdev); - struct device *dev = &pdev->dev; struct cdns_pcie_rc *rc; rc = container_of(pcie, struct cdns_pcie_rc, pcie); cdns_pcie_host_disable(rc); cdns_pcie_disable_phy(pcie); - - pm_runtime_disable(dev); } static int sg2042_pcie_suspend_noirq(struct device *dev) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 349d4657393c..519b59422b47 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -256,6 +256,16 @@ config PCIE_TEGRA194_EP in order to enable device-specific features PCIE_TEGRA194_EP must be selected. This uses the DesignWare core. +config PCIE_NXP_S32G + bool "NXP S32G PCIe controller (host mode)" + depends on ARCH_S32 || COMPILE_TEST + select PCIE_DW_HOST + help + Enable support for the PCIe controller in NXP S32G based boards to + work in Host mode. The controller is based on DesignWare IP and + can work either as RC or EP. In order to enable host-specific + features PCIE_NXP_S32G must be selected. + config PCIE_DW_PLAT bool @@ -416,6 +426,19 @@ config PCIE_SOPHGO_DW Say Y here if you want PCIe host controller support on Sophgo SoCs. +config PCIE_SPACEMIT_K1 + tristate "SpacemiT K1 PCIe controller (host mode)" + depends on ARCH_SPACEMIT || COMPILE_TEST + depends on HAS_IOMEM + select PCIE_DW_HOST + select PCI_PWRCTRL_SLOT + default ARCH_SPACEMIT + help + Enables support for the DesignWare based PCIe controller in + the SpacemiT K1 SoC operating in host mode. Three controllers + are available on the K1 SoC; the first of these shares a PHY + with a USB 3.0 host controller (one or the other can be used). + config PCIE_SPEAR13XX bool "STMicroelectronics SPEAr PCIe controller" depends on ARCH_SPEAR13XX || COMPILE_TEST @@ -482,15 +505,21 @@ config PCI_DRA7XX_EP to enable device-specific features PCI_DRA7XX_EP must be selected. This uses the DesignWare core. +# ARM32 platforms use hook_fault_code() and cannot support loadable module. config PCI_KEYSTONE bool +# On non-ARM32 platforms, loadable module can be supported. 
+config PCI_KEYSTONE_TRISTATE + tristate + config PCI_KEYSTONE_HOST - bool "TI Keystone PCIe controller (host mode)" + tristate "TI Keystone PCIe controller (host mode)" depends on ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST depends on PCI_MSI select PCIE_DW_HOST - select PCI_KEYSTONE + select PCI_KEYSTONE if ARM + select PCI_KEYSTONE_TRISTATE if !ARM help Enables support for the PCIe controller in the Keystone SoC to work in host mode. The PCI controller on Keystone is based on @@ -498,11 +527,12 @@ config PCI_KEYSTONE_HOST DesignWare core functions to implement the driver. config PCI_KEYSTONE_EP - bool "TI Keystone PCIe controller (endpoint mode)" + tristate "TI Keystone PCIe controller (endpoint mode)" depends on ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST depends on PCI_ENDPOINT select PCIE_DW_EP - select PCI_KEYSTONE + select PCI_KEYSTONE if ARM + select PCI_KEYSTONE_TRISTATE if !ARM help Enables support for the PCIe controller in the Keystone SoC to work in endpoint mode. The PCI controller on Keystone is based diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index 7ae28f3b0fb3..67ba59c02038 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -10,8 +10,12 @@ obj-$(CONFIG_PCI_DRA7XX) += pci-dra7xx.o obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o obj-$(CONFIG_PCIE_FU740) += pcie-fu740.o obj-$(CONFIG_PCI_IMX6) += pci-imx6.o +obj-$(CONFIG_PCIE_NXP_S32G) += pcie-nxp-s32g.o obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o +# ARM32 platforms use hook_fault_code() and cannot support loadable module. obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o +# On non-ARM32 platforms, loadable module can be supported. +obj-$(CONFIG_PCI_KEYSTONE_TRISTATE) += pci-keystone.o obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o obj-$(CONFIG_PCIE_QCOM_COMMON) += pcie-qcom-common.o @@ -31,6 +35,7 @@ obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o obj-$(CONFIG_PCIE_VISCONTI_HOST) += pcie-visconti.o obj-$(CONFIG_PCIE_RCAR_GEN4) += pcie-rcar-gen4.o +obj-$(CONFIG_PCIE_SPACEMIT_K1) += pcie-spacemit-k1.o obj-$(CONFIG_PCIE_STM32_HOST) += pcie-stm32.o obj-$(CONFIG_PCIE_STM32_EP) += pcie-stm32-ep.o diff --git a/drivers/pci/controller/dwc/pci-keystone.c b/drivers/pci/controller/dwc/pci-keystone.c index eb00aa380722..f86d9111f863 100644 --- a/drivers/pci/controller/dwc/pci-keystone.c +++ b/drivers/pci/controller/dwc/pci-keystone.c @@ -17,6 +17,7 @@ #include <linux/irqchip/chained_irq.h> #include <linux/irqdomain.h> #include <linux/mfd/syscon.h> +#include <linux/module.h> #include <linux/msi.h> #include <linux/of.h> #include <linux/of_irq.h> @@ -777,29 +778,7 @@ err: return ret; } -#ifdef CONFIG_ARM -/* - * When a PCI device does not exist during config cycles, keystone host - * gets a bus error instead of returning 0xffffffff (PCI_ERROR_RESPONSE). - * This handler always returns 0 for this kind of fault. 
- */ -static int ks_pcie_fault(unsigned long addr, unsigned int fsr, - struct pt_regs *regs) -{ - unsigned long instr = *(unsigned long *) instruction_pointer(regs); - - if ((instr & 0x0e100090) == 0x00100090) { - int reg = (instr >> 12) & 15; - - regs->uregs[reg] = -1; - regs->ARM_pc += 4; - } - - return 0; -} -#endif - -static int __init ks_pcie_init_id(struct keystone_pcie *ks_pcie) +static int ks_pcie_init_id(struct keystone_pcie *ks_pcie) { int ret; unsigned int id; @@ -831,7 +810,7 @@ static int __init ks_pcie_init_id(struct keystone_pcie *ks_pcie) return 0; } -static int __init ks_pcie_host_init(struct dw_pcie_rp *pp) +static int ks_pcie_host_init(struct dw_pcie_rp *pp) { struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct keystone_pcie *ks_pcie = to_keystone_pcie(pci); @@ -861,15 +840,6 @@ static int __init ks_pcie_host_init(struct dw_pcie_rp *pp) if (ret < 0) return ret; -#ifdef CONFIG_ARM - /* - * PCIe access errors that result into OCP errors are caught by ARM as - * "External aborts" - */ - hook_fault_code(17, ks_pcie_fault, SIGBUS, 0, - "Asynchronous external abort"); -#endif - return 0; } @@ -1134,6 +1104,7 @@ static const struct of_device_id ks_pcie_of_match[] = { }, { }, }; +MODULE_DEVICE_TABLE(of, ks_pcie_of_match); static int ks_pcie_probe(struct platform_device *pdev) { @@ -1337,6 +1308,8 @@ static int ks_pcie_probe(struct platform_device *pdev) break; default: dev_err(dev, "INVALID device type %d\n", mode); + ret = -EINVAL; + goto err_get_sync; } ks_pcie_enable_error_irq(ks_pcie); @@ -1379,4 +1352,45 @@ static struct platform_driver ks_pcie_driver = { .of_match_table = ks_pcie_of_match, }, }; + +#ifdef CONFIG_ARM +/* + * When a PCI device does not exist during config cycles, keystone host + * gets a bus error instead of returning 0xffffffff (PCI_ERROR_RESPONSE). + * This handler always returns 0 for this kind of fault. + */ +static int ks_pcie_fault(unsigned long addr, unsigned int fsr, + struct pt_regs *regs) +{ + unsigned long instr = *(unsigned long *)instruction_pointer(regs); + + if ((instr & 0x0e100090) == 0x00100090) { + int reg = (instr >> 12) & 15; + + regs->uregs[reg] = -1; + regs->ARM_pc += 4; + } + + return 0; +} + +static int __init ks_pcie_init(void) +{ + /* + * PCIe access errors that result into OCP errors are caught by ARM as + * "External aborts" + */ + if (of_find_matching_node(NULL, ks_pcie_of_match)) + hook_fault_code(17, ks_pcie_fault, SIGBUS, 0, + "Asynchronous external abort"); + + return platform_driver_register(&ks_pcie_driver); +} +device_initcall(ks_pcie_init); +#else builtin_platform_driver(ks_pcie_driver); +#endif + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("PCIe controller driver for Texas Instruments Keystone SoCs"); +MODULE_AUTHOR("Murali Karicheri <m-karicheri2@ti.com>"); diff --git a/drivers/pci/controller/dwc/pci-meson.c b/drivers/pci/controller/dwc/pci-meson.c index 787469d1b396..54b6a4196f17 100644 --- a/drivers/pci/controller/dwc/pci-meson.c +++ b/drivers/pci/controller/dwc/pci-meson.c @@ -108,10 +108,22 @@ static int meson_pcie_get_mems(struct platform_device *pdev, struct meson_pcie *mp) { struct dw_pcie *pci = &mp->pci; + struct resource *res; - pci->dbi_base = devm_platform_ioremap_resource_byname(pdev, "elbi"); - if (IS_ERR(pci->dbi_base)) - return PTR_ERR(pci->dbi_base); + /* + * For the broken DTs that supply 'dbi' as 'elbi', parse the 'elbi' + * region and assign it to both 'pci->elbi_base' and 'pci->dbi_space' so + * that the DWC core can skip parsing both regions. 
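+	 * dw_pcie_get_resources() (updated below) only parses 'elbi' when
+	 * elbi_base is still unset, so filling in elbi_base, dbi_base and
+	 * dbi_phys_addr here keeps the core from mapping these regions again.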
+ */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi"); + if (res) { + pci->elbi_base = devm_pci_remap_cfg_resource(pci->dev, res); + if (IS_ERR(pci->elbi_base)) + return PTR_ERR(pci->elbi_base); + + pci->dbi_base = pci->elbi_base; + pci->dbi_phys_addr = res->start; + } mp->cfg_base = devm_platform_ioremap_resource_byname(pdev, "cfg"); if (IS_ERR(mp->cfg_base)) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 7f2112c2fb21..19571ac2b961 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -797,6 +797,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no, return 0; } +EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_msix_irq); /** * dw_pcie_ep_cleanup - Cleanup DWC EP resources after fundamental reset diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index e92513c5bda5..372207c33a85 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -233,6 +233,7 @@ int dw_pcie_allocate_domains(struct dw_pcie_rp *pp) return 0; } +EXPORT_SYMBOL_GPL(dw_pcie_allocate_domains); void dw_pcie_free_msi(struct dw_pcie_rp *pp) { @@ -856,10 +857,19 @@ static void __iomem *dw_pcie_ecam_conf_map_bus(struct pci_bus *bus, unsigned int return pci->dbi_base + where; } +static int dw_pcie_op_assert_perst(struct pci_bus *bus, bool assert) +{ + struct dw_pcie_rp *pp = bus->sysdata; + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + + return dw_pcie_assert_perst(pci, assert); +} + static struct pci_ops dw_pcie_ops = { .map_bus = dw_pcie_own_conf_map_bus, .read = pci_generic_config_read, .write = pci_generic_config_write, + .assert_perst = dw_pcie_op_assert_perst, }; static struct pci_ops dw_pcie_ecam_ops = { @@ -1080,6 +1090,8 @@ int dw_pcie_setup_rc(struct dw_pcie_rp *pp) PCI_COMMAND_MASTER | PCI_COMMAND_SERR; dw_pcie_writel_dbi(pci, PCI_COMMAND, val); + dw_pcie_hide_unsupported_l1ss(pci); + dw_pcie_config_presets(pp); /* * If the platform provides its own child bus config accesses, it means diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index c644216995f6..75fc8b767fcc 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -168,11 +168,13 @@ int dw_pcie_get_resources(struct dw_pcie *pci) } /* ELBI is an optional resource */ - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi"); - if (res) { - pci->elbi_base = devm_ioremap_resource(pci->dev, res); - if (IS_ERR(pci->elbi_base)) - return PTR_ERR(pci->elbi_base); + if (!pci->elbi_base) { + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi"); + if (res) { + pci->elbi_base = devm_ioremap_resource(pci->dev, res); + if (IS_ERR(pci->elbi_base)) + return PTR_ERR(pci->elbi_base); + } } /* LLDD is supposed to manually switch the clocks and resets state */ @@ -1081,6 +1083,30 @@ void dw_pcie_edma_remove(struct dw_pcie *pci) dw_edma_remove(&pci->edma); } +void dw_pcie_hide_unsupported_l1ss(struct dw_pcie *pci) +{ + u16 l1ss; + u32 l1ss_cap; + + if (pci->l1ss_support) + return; + + l1ss = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS); + if (!l1ss) + return; + + /* + * Unless the driver claims "l1ss_support", don't advertise L1 PM + * Substates because they require CLKREQ# and possibly other + * device-specific configuration. 
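+	 * Clearing the capability bits below is enough for the ASPM core to
+	 * never enable L1.1/L1.2 on such platforms.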
+ */ + l1ss_cap = dw_pcie_readl_dbi(pci, l1ss + PCI_L1SS_CAP); + l1ss_cap &= ~(PCI_L1SS_CAP_PCIPM_L1_1 | PCI_L1SS_CAP_ASPM_L1_1 | + PCI_L1SS_CAP_PCIPM_L1_2 | PCI_L1SS_CAP_ASPM_L1_2 | + PCI_L1SS_CAP_L1_PM_SS); + dw_pcie_writel_dbi(pci, l1ss + PCI_L1SS_CAP, l1ss_cap); +} + void dw_pcie_setup(struct dw_pcie *pci) { u32 val; diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index e995f692a1ec..31685951a080 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -97,7 +97,7 @@ #define PORT_LANE_SKEW_INSERT_MASK GENMASK(23, 0) #define PCIE_PORT_DEBUG0 0x728 -#define PORT_LOGIC_LTSSM_STATE_MASK 0x1f +#define PORT_LOGIC_LTSSM_STATE_MASK 0x3f #define PORT_LOGIC_LTSSM_STATE_L0 0x11 #define PCIE_PORT_DEBUG1 0x72C #define PCIE_PORT_DEBUG1_LINK_UP BIT(4) @@ -121,6 +121,7 @@ #define GEN3_RELATED_OFF 0x890 #define GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL BIT(0) +#define GEN3_RELATED_OFF_EQ_PHASE_2_3 BIT(9) #define GEN3_RELATED_OFF_RXEQ_RGRDLESS_RXTS BIT(13) #define GEN3_RELATED_OFF_GEN3_EQ_DISABLE BIT(16) #define GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT 24 @@ -138,6 +139,13 @@ #define GEN3_EQ_FMDC_MAX_PRE_CURSOR_DELTA GENMASK(13, 10) #define GEN3_EQ_FMDC_MAX_POST_CURSOR_DELTA GENMASK(17, 14) +#define COHERENCY_CONTROL_1_OFF 0x8E0 +#define CFG_MEMTYPE_BOUNDARY_LOW_ADDR_MASK GENMASK(31, 2) +#define CFG_MEMTYPE_VALUE BIT(0) + +#define COHERENCY_CONTROL_2_OFF 0x8E4 +#define COHERENCY_CONTROL_3_OFF 0x8E8 + #define PCIE_PORT_MULTI_LANE_CTRL 0x8C0 #define PORT_MLTI_UPCFG_SUPPORT BIT(7) @@ -485,6 +493,7 @@ struct dw_pcie_ops { enum dw_pcie_ltssm (*get_ltssm)(struct dw_pcie *pcie); int (*start_link)(struct dw_pcie *pcie); void (*stop_link)(struct dw_pcie *pcie); + int (*assert_perst)(struct dw_pcie *pcie, bool assert); }; struct debugfs_info { @@ -516,6 +525,7 @@ struct dw_pcie { int max_link_speed; u8 n_fts[2]; struct dw_edma_chip edma; + bool l1ss_support; /* L1 PM Substates support */ struct clk_bulk_data app_clks[DW_PCIE_NUM_APP_CLKS]; struct clk_bulk_data core_clks[DW_PCIE_NUM_CORE_CLKS]; struct reset_control_bulk_data app_rsts[DW_PCIE_NUM_APP_RSTS]; @@ -573,6 +583,7 @@ int dw_pcie_prog_ep_inbound_atu(struct dw_pcie *pci, u8 func_no, int index, int type, u64 parent_bus_addr, u8 bar, size_t size); void dw_pcie_disable_atu(struct dw_pcie *pci, u32 dir, int index); +void dw_pcie_hide_unsupported_l1ss(struct dw_pcie *pci); void dw_pcie_setup(struct dw_pcie *pci); void dw_pcie_iatu_detect(struct dw_pcie *pci); int dw_pcie_edma_detect(struct dw_pcie *pci); @@ -787,6 +798,14 @@ static inline void dw_pcie_stop_link(struct dw_pcie *pci) pci->ops->stop_link(pci); } +static inline int dw_pcie_assert_perst(struct dw_pcie *pci, bool assert) +{ + if (pci->ops && pci->ops->assert_perst) + return pci->ops->assert_perst(pci, assert); + + return 0; +} + static inline enum dw_pcie_ltssm dw_pcie_get_ltssm(struct dw_pcie *pci) { u32 val; diff --git a/drivers/pci/controller/dwc/pcie-dw-rockchip.c b/drivers/pci/controller/dwc/pcie-dw-rockchip.c index 3e2752c7dd09..f8605fe61a41 100644 --- a/drivers/pci/controller/dwc/pcie-dw-rockchip.c +++ b/drivers/pci/controller/dwc/pcie-dw-rockchip.c @@ -62,6 +62,12 @@ /* Interrupt Mask Register Related to Miscellaneous Operation */ #define PCIE_CLIENT_INTR_MASK_MISC 0x24 +/* Power Management Control Register */ +#define PCIE_CLIENT_POWER_CON 0x2c +#define PCIE_CLKREQ_READY FIELD_PREP_WM16(BIT(0), 1) +#define PCIE_CLKREQ_NOT_READY FIELD_PREP_WM16(BIT(0), 0) +#define PCIE_CLKREQ_PULL_DOWN 
FIELD_PREP_WM16(GENMASK(13, 12), 1) + /* Hot Reset Control Register */ #define PCIE_CLIENT_HOT_RESET_CTRL 0x180 #define PCIE_LTSSM_APP_DLY2_EN BIT(1) @@ -82,9 +88,9 @@ struct rockchip_pcie { unsigned int clk_cnt; struct reset_control *rst; struct gpio_desc *rst_gpio; - struct regulator *vpcie3v3; struct irq_domain *irq_domain; const struct rockchip_pcie_of_data *data; + bool supports_clkreq; }; struct rockchip_pcie_of_data { @@ -200,6 +206,35 @@ static bool rockchip_pcie_link_up(struct dw_pcie *pci) return FIELD_GET(PCIE_LINKUP_MASK, val) == PCIE_LINKUP; } +/* + * See e.g. section '11.6.6.4 L1 Substate' in the RK3588 TRM V1.0 for the steps + * needed to support L1 substates. Currently, just enable L1 substates for RC + * mode if CLKREQ# is properly connected and supports-clkreq is present in DT. + * For EP mode, there are more things should be done to actually save power in + * L1 substates, so disable L1 substates until there is proper support. + */ +static void rockchip_pcie_configure_l1ss(struct dw_pcie *pci) +{ + struct rockchip_pcie *rockchip = to_rockchip_pcie(pci); + + /* Enable L1 substates if CLKREQ# is properly connected */ + if (rockchip->supports_clkreq) { + rockchip_pcie_writel_apb(rockchip, PCIE_CLKREQ_READY, + PCIE_CLIENT_POWER_CON); + pci->l1ss_support = true; + return; + } + + /* + * Otherwise, assert CLKREQ# unconditionally. Since + * pci->l1ss_support is not set, the DWC core will prevent L1 + * Substates support from being advertised. + */ + rockchip_pcie_writel_apb(rockchip, + PCIE_CLKREQ_PULL_DOWN | PCIE_CLKREQ_NOT_READY, + PCIE_CLIENT_POWER_CON); +} + static void rockchip_pcie_enable_l0s(struct dw_pcie *pci) { u32 cap, lnkcap; @@ -264,6 +299,7 @@ static int rockchip_pcie_host_init(struct dw_pcie_rp *pp) irq_set_chained_handler_and_data(irq, rockchip_pcie_intx_handler, rockchip); + rockchip_pcie_configure_l1ss(pci); rockchip_pcie_enable_l0s(pci); return 0; @@ -412,6 +448,9 @@ static int rockchip_pcie_resource_get(struct platform_device *pdev, return dev_err_probe(&pdev->dev, PTR_ERR(rockchip->rst), "failed to get reset lines\n"); + rockchip->supports_clkreq = of_property_read_bool(pdev->dev.of_node, + "supports-clkreq"); + return 0; } @@ -652,22 +691,15 @@ static int rockchip_pcie_probe(struct platform_device *pdev) return ret; /* DON'T MOVE ME: must be enable before PHY init */ - rockchip->vpcie3v3 = devm_regulator_get_optional(dev, "vpcie3v3"); - if (IS_ERR(rockchip->vpcie3v3)) { - if (PTR_ERR(rockchip->vpcie3v3) != -ENODEV) - return dev_err_probe(dev, PTR_ERR(rockchip->vpcie3v3), - "failed to get vpcie3v3 regulator\n"); - rockchip->vpcie3v3 = NULL; - } else { - ret = regulator_enable(rockchip->vpcie3v3); - if (ret) - return dev_err_probe(dev, ret, - "failed to enable vpcie3v3 regulator\n"); - } + ret = devm_regulator_get_enable_optional(dev, "vpcie3v3"); + if (ret < 0 && ret != -ENODEV) + return dev_err_probe(dev, ret, + "failed to enable vpcie3v3 regulator\n"); ret = rockchip_pcie_phy_init(rockchip); if (ret) - goto disable_regulator; + return dev_err_probe(dev, ret, + "failed to initialize the phy\n"); ret = reset_control_deassert(rockchip->rst); if (ret) @@ -700,9 +732,6 @@ deinit_clk: clk_bulk_disable_unprepare(rockchip->clk_cnt, rockchip->clks); deinit_phy: rockchip_pcie_phy_deinit(rockchip); -disable_regulator: - if (rockchip->vpcie3v3) - regulator_disable(rockchip->vpcie3v3); return ret; } diff --git a/drivers/pci/controller/dwc/pcie-nxp-s32g.c b/drivers/pci/controller/dwc/pcie-nxp-s32g.c new file mode 100644 index 000000000000..47745749f75c --- /dev/null 
+++ b/drivers/pci/controller/dwc/pcie-nxp-s32g.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCIe host controller driver for NXP S32G SoCs + * + * Copyright 2019-2025 NXP + */ + +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of_device.h> +#include <linux/of_address.h> +#include <linux/pci.h> +#include <linux/phy/phy.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> +#include <linux/sizes.h> +#include <linux/types.h> + +#include "pcie-designware.h" + +/* PCIe controller Sub-System */ + +/* PCIe controller 0 General Control 1 */ +#define PCIE_S32G_PE0_GEN_CTRL_1 0x50 +#define DEVICE_TYPE_MASK GENMASK(3, 0) +#define SRIS_MODE BIT(8) + +/* PCIe controller 0 General Control 3 */ +#define PCIE_S32G_PE0_GEN_CTRL_3 0x58 +#define LTSSM_EN BIT(0) + +/* PCIe Controller 0 Interrupt Status */ +#define PCIE_S32G_PE0_INT_STS 0xE8 +#define HP_INT_STS BIT(6) + +/* Boundary between peripheral space and physical memory space */ +#define S32G_MEMORY_BOUNDARY_ADDR 0x80000000 + +struct s32g_pcie_port { + struct list_head list; + struct phy *phy; +}; + +struct s32g_pcie { + struct dw_pcie pci; + void __iomem *ctrl_base; + struct list_head ports; +}; + +#define to_s32g_from_dw_pcie(x) \ + container_of(x, struct s32g_pcie, pci) + +static void s32g_pcie_writel_ctrl(struct s32g_pcie *s32g_pp, u32 reg, u32 val) +{ + writel(val, s32g_pp->ctrl_base + reg); +} + +static u32 s32g_pcie_readl_ctrl(struct s32g_pcie *s32g_pp, u32 reg) +{ + return readl(s32g_pp->ctrl_base + reg); +} + +static void s32g_pcie_enable_ltssm(struct s32g_pcie *s32g_pp) +{ + u32 reg; + + reg = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3); + reg |= LTSSM_EN; + s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3, reg); +} + +static void s32g_pcie_disable_ltssm(struct s32g_pcie *s32g_pp) +{ + u32 reg; + + reg = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3); + reg &= ~LTSSM_EN; + s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3, reg); +} + +static int s32g_pcie_start_link(struct dw_pcie *pci) +{ + struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci); + + s32g_pcie_enable_ltssm(s32g_pp); + + return 0; +} + +static void s32g_pcie_stop_link(struct dw_pcie *pci) +{ + struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci); + + s32g_pcie_disable_ltssm(s32g_pp); +} + +static struct dw_pcie_ops s32g_pcie_ops = { + .start_link = s32g_pcie_start_link, + .stop_link = s32g_pcie_stop_link, +}; + +/* Configure the AMBA AXI Coherency Extensions (ACE) interface */ +static void s32g_pcie_reset_mstr_ace(struct dw_pcie *pci) +{ + u32 ddr_base_low = lower_32_bits(S32G_MEMORY_BOUNDARY_ADDR); + u32 ddr_base_high = upper_32_bits(S32G_MEMORY_BOUNDARY_ADDR); + + dw_pcie_dbi_ro_wr_en(pci); + dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_3_OFF, 0x0); + + /* + * Ncore is a cache-coherent interconnect module that enables the + * integration of heterogeneous coherent and non-coherent agents in + * the chip. Ncore transactions to peripheral should be non-coherent + * or it might drop them. + * + * One example where this is needed are PCIe MSIs, which use NoSnoop=0 + * and might end up routed to Ncore. PCIe coherent traffic (e.g. MSIs) + * that targets peripheral space will be dropped by Ncore because + * peripherals on S32G are not coherent as slaves. We add a hard + * boundary in the PCIe controller coherency control registers to + * separate physical memory space from peripheral space. 
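+ * For example, with S32G_MEMORY_BOUNDARY_ADDR = 0x80000000 the low
+ * boundary register below is programmed with 0x80000000 and the high
+ * one with 0, so everything under 2 GiB is treated as peripheral space.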
+ * + * Define the start of DDR as seen by Linux as this boundary between + * "memory" and "peripherals", with peripherals being below. + */ + dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_1_OFF, + (ddr_base_low & CFG_MEMTYPE_BOUNDARY_LOW_ADDR_MASK)); + dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_2_OFF, ddr_base_high); + dw_pcie_dbi_ro_wr_dis(pci); +} + +static int s32g_init_pcie_controller(struct dw_pcie_rp *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci); + u32 val; + + /* Set RP mode */ + val = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_1); + val &= ~DEVICE_TYPE_MASK; + val |= FIELD_PREP(DEVICE_TYPE_MASK, PCI_EXP_TYPE_ROOT_PORT); + + /* Use default CRNS */ + val &= ~SRIS_MODE; + + s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_1, val); + + /* + * Make sure we use the coherency defaults (just in case the settings + * have been changed from their reset values) + */ + s32g_pcie_reset_mstr_ace(pci); + + dw_pcie_dbi_ro_wr_en(pci); + + val = dw_pcie_readl_dbi(pci, PCIE_PORT_FORCE); + val |= PORT_FORCE_DO_DESKEW_FOR_SRIS; + dw_pcie_writel_dbi(pci, PCIE_PORT_FORCE, val); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val |= GEN3_RELATED_OFF_EQ_PHASE_2_3; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + dw_pcie_dbi_ro_wr_dis(pci); + + return 0; +} + +static const struct dw_pcie_host_ops s32g_pcie_host_ops = { + .init = s32g_init_pcie_controller, +}; + +static int s32g_init_pcie_phy(struct s32g_pcie *s32g_pp) +{ + struct dw_pcie *pci = &s32g_pp->pci; + struct device *dev = pci->dev; + struct s32g_pcie_port *port, *tmp; + int ret; + + list_for_each_entry(port, &s32g_pp->ports, list) { + ret = phy_init(port->phy); + if (ret) { + dev_err(dev, "Failed to init serdes PHY\n"); + goto err_phy_revert; + } + + ret = phy_set_mode_ext(port->phy, PHY_MODE_PCIE, 0); + if (ret) { + dev_err(dev, "Failed to set mode on serdes PHY\n"); + goto err_phy_exit; + } + + ret = phy_power_on(port->phy); + if (ret) { + dev_err(dev, "Failed to power on serdes PHY\n"); + goto err_phy_exit; + } + } + + return 0; + +err_phy_exit: + phy_exit(port->phy); + +err_phy_revert: + list_for_each_entry_continue_reverse(port, &s32g_pp->ports, list) { + phy_power_off(port->phy); + phy_exit(port->phy); + } + + list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list) + list_del(&port->list); + + return ret; +} + +static void s32g_deinit_pcie_phy(struct s32g_pcie *s32g_pp) +{ + struct s32g_pcie_port *port, *tmp; + + list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list) { + phy_power_off(port->phy); + phy_exit(port->phy); + list_del(&port->list); + } +} + +static int s32g_pcie_init(struct device *dev, struct s32g_pcie *s32g_pp) +{ + s32g_pcie_disable_ltssm(s32g_pp); + + return s32g_init_pcie_phy(s32g_pp); +} + +static void s32g_pcie_deinit(struct s32g_pcie *s32g_pp) +{ + s32g_pcie_disable_ltssm(s32g_pp); + + s32g_deinit_pcie_phy(s32g_pp); +} + +static int s32g_pcie_parse_port(struct s32g_pcie *s32g_pp, struct device_node *node) +{ + struct device *dev = s32g_pp->pci.dev; + struct s32g_pcie_port *port; + int num_lanes; + + port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + port->phy = devm_of_phy_get(dev, node, NULL); + if (IS_ERR(port->phy)) + return dev_err_probe(dev, PTR_ERR(port->phy), + "Failed to get serdes PHY\n"); + + INIT_LIST_HEAD(&port->list); + list_add_tail(&port->list, &s32g_pp->ports); + + /* + * The DWC core initialization code cannot yet parse the num-lanes + * attribute in the Root Port node. 
The S32G only supports one Root + * Port for now so its driver can parse the node and set the num_lanes + * field of struct dwc_pcie before calling dw_pcie_host_init(). + */ + if (!of_property_read_u32(node, "num-lanes", &num_lanes)) + s32g_pp->pci.num_lanes = num_lanes; + + return 0; +} + +static int s32g_pcie_parse_ports(struct device *dev, struct s32g_pcie *s32g_pp) +{ + struct s32g_pcie_port *port, *tmp; + int ret = -ENOENT; + + for_each_available_child_of_node_scoped(dev->of_node, of_port) { + if (!of_node_is_type(of_port, "pci")) + continue; + + ret = s32g_pcie_parse_port(s32g_pp, of_port); + if (ret) + goto err_port; + } + +err_port: + list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list) + list_del(&port->list); + + return ret; +} + +static int s32g_pcie_get_resources(struct platform_device *pdev, + struct s32g_pcie *s32g_pp) +{ + struct dw_pcie *pci = &s32g_pp->pci; + struct device *dev = &pdev->dev; + int ret; + + pci->dev = dev; + pci->ops = &s32g_pcie_ops; + + s32g_pp->ctrl_base = devm_platform_ioremap_resource_byname(pdev, "ctrl"); + if (IS_ERR(s32g_pp->ctrl_base)) + return PTR_ERR(s32g_pp->ctrl_base); + + INIT_LIST_HEAD(&s32g_pp->ports); + + ret = s32g_pcie_parse_ports(dev, s32g_pp); + if (ret) + return dev_err_probe(dev, ret, + "Failed to parse Root Port: %d\n", ret); + + platform_set_drvdata(pdev, s32g_pp); + + return 0; +} + +static int s32g_pcie_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct s32g_pcie *s32g_pp; + struct dw_pcie_rp *pp; + int ret; + + s32g_pp = devm_kzalloc(dev, sizeof(*s32g_pp), GFP_KERNEL); + if (!s32g_pp) + return -ENOMEM; + + ret = s32g_pcie_get_resources(pdev, s32g_pp); + if (ret) + return ret; + + pm_runtime_no_callbacks(dev); + devm_pm_runtime_enable(dev); + ret = pm_runtime_get_sync(dev); + if (ret < 0) + goto err_pm_runtime_put; + + ret = s32g_pcie_init(dev, s32g_pp); + if (ret) + goto err_pm_runtime_put; + + pp = &s32g_pp->pci.pp; + pp->ops = &s32g_pcie_host_ops; + pp->use_atu_msg = true; + + ret = dw_pcie_host_init(pp); + if (ret) + goto err_pcie_deinit; + + return 0; + +err_pcie_deinit: + s32g_pcie_deinit(s32g_pp); +err_pm_runtime_put: + pm_runtime_put(dev); + + return ret; +} + +static int s32g_pcie_suspend_noirq(struct device *dev) +{ + struct s32g_pcie *s32g_pp = dev_get_drvdata(dev); + struct dw_pcie *pci = &s32g_pp->pci; + + return dw_pcie_suspend_noirq(pci); +} + +static int s32g_pcie_resume_noirq(struct device *dev) +{ + struct s32g_pcie *s32g_pp = dev_get_drvdata(dev); + struct dw_pcie *pci = &s32g_pp->pci; + + return dw_pcie_resume_noirq(pci); +} + +static const struct dev_pm_ops s32g_pcie_pm_ops = { + NOIRQ_SYSTEM_SLEEP_PM_OPS(s32g_pcie_suspend_noirq, + s32g_pcie_resume_noirq) +}; + +static const struct of_device_id s32g_pcie_of_match[] = { + { .compatible = "nxp,s32g2-pcie" }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, s32g_pcie_of_match); + +static struct platform_driver s32g_pcie_driver = { + .driver = { + .name = "s32g-pcie", + .of_match_table = s32g_pcie_of_match, + .suppress_bind_attrs = true, + .pm = pm_sleep_ptr(&s32g_pcie_pm_ops), + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, + .probe = s32g_pcie_probe, +}; + +builtin_platform_driver(s32g_pcie_driver); + +MODULE_AUTHOR("Ionut Vicovan <Ionut.Vicovan@nxp.com>"); +MODULE_DESCRIPTION("NXP S32G PCIe Host controller driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index c48a20602d7f..7b92e7a1c0d9 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ 
b/drivers/pci/controller/dwc/pcie-qcom.c @@ -641,6 +641,18 @@ static int qcom_pcie_post_init_1_0_0(struct qcom_pcie *pcie) return 0; } +static int qcom_pcie_assert_perst(struct dw_pcie *pci, bool assert) +{ + struct qcom_pcie *pcie = to_qcom_pcie(pci); + + if (assert) + qcom_ep_reset_assert(pcie); + else + qcom_ep_reset_deassert(pcie); + + return 0; +} + static void qcom_pcie_2_3_2_ltssm_enable(struct qcom_pcie *pcie) { u32 val; @@ -1012,6 +1024,8 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie) val &= ~REQ_NOT_ENTR_L1; writel(val, pcie->parf + PARF_PM_CTRL); + pci->l1ss_support = true; + val = readl(pcie->parf + PARF_AXI_MSTR_WR_ADDR_HALT_V2); val |= EN; writel(val, pcie->parf + PARF_AXI_MSTR_WR_ADDR_HALT_V2); @@ -1480,6 +1494,7 @@ static const struct qcom_pcie_cfg cfg_fw_managed = { static const struct dw_pcie_ops dw_pcie_ops = { .link_up = qcom_pcie_link_up, .start_link = qcom_pcie_start_link, + .assert_perst = qcom_pcie_assert_perst, }; static int qcom_pcie_icc_init(struct qcom_pcie *pcie) @@ -1529,6 +1544,7 @@ static void qcom_pcie_icc_opp_update(struct qcom_pcie *pcie) { u32 offset, status, width, speed; struct dw_pcie *pci = pcie->pci; + struct dev_pm_opp_key key = {}; unsigned long freq_kbps; struct dev_pm_opp *opp; int ret, freq_mbps; @@ -1556,8 +1572,20 @@ static void qcom_pcie_icc_opp_update(struct qcom_pcie *pcie) return; freq_kbps = freq_mbps * KILO; - opp = dev_pm_opp_find_freq_exact(pci->dev, freq_kbps * width, - true); + opp = dev_pm_opp_find_level_exact(pci->dev, speed); + if (IS_ERR(opp)) { + /* opp-level is not defined use only frequency */ + opp = dev_pm_opp_find_freq_exact(pci->dev, freq_kbps * width, + true); + } else { + /* put opp-level OPP */ + dev_pm_opp_put(opp); + + key.freq = freq_kbps * width; + key.level = speed; + key.bw = 0; + opp = dev_pm_opp_find_key_exact(pci->dev, &key, true); + } if (!IS_ERR(opp)) { ret = dev_pm_opp_set_opp(pci->dev, opp); if (ret) diff --git a/drivers/pci/controller/dwc/pcie-spacemit-k1.c b/drivers/pci/controller/dwc/pcie-spacemit-k1.c new file mode 100644 index 000000000000..be20a520255b --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-spacemit-k1.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SpacemiT K1 PCIe host driver + * + * Copyright (C) 2025 by RISCstar Solutions Corporation. All rights reserved. + * Copyright (c) 2023, spacemit Corporation. 
+ */ + +#include <linux/clk.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/mfd/syscon.h> +#include <linux/mod_devicetable.h> +#include <linux/phy/phy.h> +#include <linux/platform_device.h> +#include <linux/regmap.h> +#include <linux/reset.h> +#include <linux/types.h> + +#include "pcie-designware.h" + +#define PCI_VENDOR_ID_SPACEMIT 0x201f +#define PCI_DEVICE_ID_SPACEMIT_K1 0x0001 + +/* Offsets and field definitions for link management registers */ +#define K1_PHY_AHB_IRQ_EN 0x0000 +#define PCIE_INTERRUPT_EN BIT(0) + +#define K1_PHY_AHB_LINK_STS 0x0004 +#define SMLH_LINK_UP BIT(1) +#define RDLH_LINK_UP BIT(12) + +#define INTR_ENABLE 0x0014 +#define MSI_CTRL_INT BIT(11) + +/* Some controls require APMU regmap access */ +#define SYSCON_APMU "spacemit,apmu" + +/* Offsets and field definitions for APMU registers */ +#define PCIE_CLK_RESET_CONTROL 0x0000 +#define LTSSM_EN BIT(6) +#define PCIE_AUX_PWR_DET BIT(9) +#define PCIE_RC_PERST BIT(12) /* 1: assert PERST# */ +#define APP_HOLD_PHY_RST BIT(30) +#define DEVICE_TYPE_RC BIT(31) /* 0: endpoint; 1: RC */ + +#define PCIE_CONTROL_LOGIC 0x0004 +#define PCIE_SOFT_RESET BIT(0) + +struct k1_pcie { + struct dw_pcie pci; + struct phy *phy; + void __iomem *link; + struct regmap *pmu; /* Errors ignored; MMIO-backed regmap */ + u32 pmu_off; +}; + +#define to_k1_pcie(dw_pcie) \ + platform_get_drvdata(to_platform_device((dw_pcie)->dev)) + +static void k1_pcie_toggle_soft_reset(struct k1_pcie *k1) +{ + u32 offset; + u32 val; + + /* + * Write, then read back to guarantee it has reached the device + * before we start the delay. + */ + offset = k1->pmu_off + PCIE_CONTROL_LOGIC; + regmap_set_bits(k1->pmu, offset, PCIE_SOFT_RESET); + regmap_read(k1->pmu, offset, &val); + + mdelay(2); + + regmap_clear_bits(k1->pmu, offset, PCIE_SOFT_RESET); +} + +/* Enable app clocks, deassert resets */ +static int k1_pcie_enable_resources(struct k1_pcie *k1) +{ + struct dw_pcie *pci = &k1->pci; + int ret; + + ret = clk_bulk_prepare_enable(ARRAY_SIZE(pci->app_clks), pci->app_clks); + if (ret) + return ret; + + ret = reset_control_bulk_deassert(ARRAY_SIZE(pci->app_rsts), + pci->app_rsts); + if (ret) + goto err_disable_clks; + + return 0; + +err_disable_clks: + clk_bulk_disable_unprepare(ARRAY_SIZE(pci->app_clks), pci->app_clks); + + return ret; +} + +/* Assert resets, disable app clocks */ +static void k1_pcie_disable_resources(struct k1_pcie *k1) +{ + struct dw_pcie *pci = &k1->pci; + + reset_control_bulk_assert(ARRAY_SIZE(pci->app_rsts), pci->app_rsts); + clk_bulk_disable_unprepare(ARRAY_SIZE(pci->app_clks), pci->app_clks); +} + +/* FIXME: Disable ASPM L1 to avoid errors reported on some NVMe drives */ +static void k1_pcie_disable_aspm_l1(struct k1_pcie *k1) +{ + struct dw_pcie *pci = &k1->pci; + u8 offset; + u32 val; + + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + offset += PCI_EXP_LNKCAP; + + dw_pcie_dbi_ro_wr_en(pci); + val = dw_pcie_readl_dbi(pci, offset); + val &= ~PCI_EXP_LNKCAP_ASPM_L1; + dw_pcie_writel_dbi(pci, offset, val); + dw_pcie_dbi_ro_wr_dis(pci); +} + +static int k1_pcie_init(struct dw_pcie_rp *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct k1_pcie *k1 = to_k1_pcie(pci); + u32 reset_ctrl; + u32 val; + int ret; + + k1_pcie_toggle_soft_reset(k1); + + ret = k1_pcie_enable_resources(k1); + if (ret) + return ret; + + /* Set the PCI vendor and device ID */ + dw_pcie_dbi_ro_wr_en(pci); + dw_pcie_writew_dbi(pci, PCI_VENDOR_ID, PCI_VENDOR_ID_SPACEMIT); + 
dw_pcie_writew_dbi(pci, PCI_DEVICE_ID, PCI_DEVICE_ID_SPACEMIT_K1); + dw_pcie_dbi_ro_wr_dis(pci); + + /* + * Start by asserting fundamental reset (drive PERST# low). The + * PCI CEM spec says that PERST# should be deasserted at least + * 100ms after the power becomes stable, so we'll insert that + * delay first. Write, then read it back to guarantee the write + * reaches the device before we start the delay. + */ + reset_ctrl = k1->pmu_off + PCIE_CLK_RESET_CONTROL; + regmap_set_bits(k1->pmu, reset_ctrl, PCIE_RC_PERST); + regmap_read(k1->pmu, reset_ctrl, &val); + mdelay(PCIE_T_PVPERL_MS); + + /* + * Put the controller in root complex mode, and indicate that + * Vaux (3.3v) is present. + */ + regmap_set_bits(k1->pmu, reset_ctrl, DEVICE_TYPE_RC | PCIE_AUX_PWR_DET); + + ret = phy_init(k1->phy); + if (ret) { + k1_pcie_disable_resources(k1); + + return ret; + } + + /* Deassert fundamental reset (drive PERST# high) */ + regmap_clear_bits(k1->pmu, reset_ctrl, PCIE_RC_PERST); + + /* Finally, as a workaround, disable ASPM L1 */ + k1_pcie_disable_aspm_l1(k1); + + return 0; +} + +static void k1_pcie_deinit(struct dw_pcie_rp *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct k1_pcie *k1 = to_k1_pcie(pci); + + /* Assert fundamental reset (drive PERST# low) */ + regmap_set_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL, + PCIE_RC_PERST); + + phy_exit(k1->phy); + + k1_pcie_disable_resources(k1); +} + +static const struct dw_pcie_host_ops k1_pcie_host_ops = { + .init = k1_pcie_init, + .deinit = k1_pcie_deinit, +}; + +static bool k1_pcie_link_up(struct dw_pcie *pci) +{ + struct k1_pcie *k1 = to_k1_pcie(pci); + u32 val; + + val = readl_relaxed(k1->link + K1_PHY_AHB_LINK_STS); + + return (val & RDLH_LINK_UP) && (val & SMLH_LINK_UP); +} + +static int k1_pcie_start_link(struct dw_pcie *pci) +{ + struct k1_pcie *k1 = to_k1_pcie(pci); + u32 val; + + /* Stop holding the PHY in reset, and enable link training */ + regmap_update_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL, + APP_HOLD_PHY_RST | LTSSM_EN, LTSSM_EN); + + /* Enable the MSI interrupt */ + writel_relaxed(MSI_CTRL_INT, k1->link + INTR_ENABLE); + + /* Top-level interrupt enable */ + val = readl_relaxed(k1->link + K1_PHY_AHB_IRQ_EN); + val |= PCIE_INTERRUPT_EN; + writel_relaxed(val, k1->link + K1_PHY_AHB_IRQ_EN); + + return 0; +} + +static void k1_pcie_stop_link(struct dw_pcie *pci) +{ + struct k1_pcie *k1 = to_k1_pcie(pci); + u32 val; + + /* Disable interrupts */ + val = readl_relaxed(k1->link + K1_PHY_AHB_IRQ_EN); + val &= ~PCIE_INTERRUPT_EN; + writel_relaxed(val, k1->link + K1_PHY_AHB_IRQ_EN); + + writel_relaxed(0, k1->link + INTR_ENABLE); + + /* Disable the link and hold the PHY in reset */ + regmap_update_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL, + APP_HOLD_PHY_RST | LTSSM_EN, APP_HOLD_PHY_RST); +} + +static const struct dw_pcie_ops k1_pcie_ops = { + .link_up = k1_pcie_link_up, + .start_link = k1_pcie_start_link, + .stop_link = k1_pcie_stop_link, +}; + +static int k1_pcie_parse_port(struct k1_pcie *k1) +{ + struct device *dev = k1->pci.dev; + struct device_node *root_port; + struct phy *phy; + + /* We assume only one root port */ + root_port = of_get_next_available_child(dev_of_node(dev), NULL); + if (!root_port) + return -EINVAL; + + phy = devm_of_phy_get(dev, root_port, NULL); + + of_node_put(root_port); + + if (IS_ERR(phy)) + return PTR_ERR(phy); + + k1->phy = phy; + + return 0; +} + +static int k1_pcie_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct k1_pcie *k1; + int ret; + + k1 = 
devm_kzalloc(dev, sizeof(*k1), GFP_KERNEL); + if (!k1) + return -ENOMEM; + + k1->pmu = syscon_regmap_lookup_by_phandle_args(dev_of_node(dev), + SYSCON_APMU, 1, + &k1->pmu_off); + if (IS_ERR(k1->pmu)) + return dev_err_probe(dev, PTR_ERR(k1->pmu), + "failed to lookup PMU registers\n"); + + k1->link = devm_platform_ioremap_resource_byname(pdev, "link"); + if (IS_ERR(k1->link)) + return dev_err_probe(dev, PTR_ERR(k1->link), + "failed to map \"link\" registers\n"); + + k1->pci.dev = dev; + k1->pci.ops = &k1_pcie_ops; + k1->pci.pp.num_vectors = MAX_MSI_IRQS; + dw_pcie_cap_set(&k1->pci, REQ_RES); + + k1->pci.pp.ops = &k1_pcie_host_ops; + + /* Hold the PHY in reset until we start the link */ + regmap_set_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL, + APP_HOLD_PHY_RST); + + ret = devm_regulator_get_enable(dev, "vpcie3v3"); + if (ret) + return dev_err_probe(dev, ret, + "failed to get \"vpcie3v3\" supply\n"); + + pm_runtime_set_active(dev); + pm_runtime_no_callbacks(dev); + devm_pm_runtime_enable(dev); + + platform_set_drvdata(pdev, k1); + + ret = k1_pcie_parse_port(k1); + if (ret) + return dev_err_probe(dev, ret, "failed to parse root port\n"); + + ret = dw_pcie_host_init(&k1->pci.pp); + if (ret) + return dev_err_probe(dev, ret, "failed to initialize host\n"); + + return 0; +} + +static void k1_pcie_remove(struct platform_device *pdev) +{ + struct k1_pcie *k1 = platform_get_drvdata(pdev); + + dw_pcie_host_deinit(&k1->pci.pp); +} + +static const struct of_device_id k1_pcie_of_match_table[] = { + { .compatible = "spacemit,k1-pcie", }, + { } +}; + +static struct platform_driver k1_pcie_driver = { + .probe = k1_pcie_probe, + .remove = k1_pcie_remove, + .driver = { + .name = "spacemit-k1-pcie", + .of_match_table = k1_pcie_of_match_table, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, +}; +module_platform_driver(k1_pcie_driver); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SpacemiT K1 PCIe host driver"); diff --git a/drivers/pci/controller/dwc/pcie-stm32-ep.c b/drivers/pci/controller/dwc/pcie-stm32-ep.c index 3400c7cd2d88..2cecf32d2b0f 100644 --- a/drivers/pci/controller/dwc/pcie-stm32-ep.c +++ b/drivers/pci/controller/dwc/pcie-stm32-ep.c @@ -7,9 +7,9 @@ */ #include <linux/clk.h> +#include <linux/gpio/consumer.h> #include <linux/mfd/syscon.h> #include <linux/of_platform.h> -#include <linux/of_gpio.h> #include <linux/phy/phy.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> @@ -37,36 +37,9 @@ static void stm32_pcie_ep_init(struct dw_pcie_ep *ep) dw_pcie_ep_reset_bar(pci, bar); } -static int stm32_pcie_enable_link(struct dw_pcie *pci) -{ - struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci); - - regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR, - STM32MP25_PCIECR_LTSSM_EN, - STM32MP25_PCIECR_LTSSM_EN); - - return dw_pcie_wait_for_link(pci); -} - -static void stm32_pcie_disable_link(struct dw_pcie *pci) -{ - struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci); - - regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR, STM32MP25_PCIECR_LTSSM_EN, 0); -} - static int stm32_pcie_start_link(struct dw_pcie *pci) { struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci); - int ret; - - dev_dbg(pci->dev, "Enable link\n"); - - ret = stm32_pcie_enable_link(pci); - if (ret) { - dev_err(pci->dev, "PCIe cannot establish link: %d\n", ret); - return ret; - } enable_irq(stm32_pcie->perst_irq); @@ -77,11 +50,7 @@ static void stm32_pcie_stop_link(struct dw_pcie *pci) { struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci); - dev_dbg(pci->dev, "Disable link\n"); - disable_irq(stm32_pcie->perst_irq); - - 
stm32_pcie_disable_link(pci); } static int stm32_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no, @@ -152,6 +121,9 @@ static void stm32_pcie_perst_assert(struct dw_pcie *pci) dev_dbg(dev, "PERST asserted by host\n"); + regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR, + STM32MP25_PCIECR_LTSSM_EN, 0); + pci_epc_deinit_notify(ep->epc); stm32_pcie_disable_resources(stm32_pcie); @@ -192,6 +164,11 @@ static void stm32_pcie_perst_deassert(struct dw_pcie *pci) pci_epc_init_notify(ep->epc); + /* Enable link training */ + regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR, + STM32MP25_PCIECR_LTSSM_EN, + STM32MP25_PCIECR_LTSSM_EN); + return; err_disable_resources: @@ -237,6 +214,8 @@ static int stm32_add_pcie_ep(struct stm32_pcie *stm32_pcie, ep->ops = &stm32_pcie_ep_ops; + ep->page_size = stm32_pcie_epc_features.align; + ret = dw_pcie_ep_init(ep); if (ret) { dev_err(dev, "Failed to initialize ep: %d\n", ret); diff --git a/drivers/pci/controller/dwc/pcie-stm32.c b/drivers/pci/controller/dwc/pcie-stm32.c index 96a5fb893af4..a9e77478443b 100644 --- a/drivers/pci/controller/dwc/pcie-stm32.c +++ b/drivers/pci/controller/dwc/pcie-stm32.c @@ -7,18 +7,30 @@ */ #include <linux/clk.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/gpio/consumer.h> +#include <linux/irq.h> #include <linux/mfd/syscon.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/of.h> #include <linux/of_platform.h> #include <linux/phy/phy.h> #include <linux/pinctrl/consumer.h> #include <linux/platform_device.h> +#include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <linux/regmap.h> #include <linux/reset.h> +#include <linux/stddef.h> + +#include "../../pci.h" + #include "pcie-designware.h" #include "pcie-stm32.h" -#include "../../pci.h" struct stm32_pcie { struct dw_pcie pci; diff --git a/drivers/pci/controller/dwc/pcie-stm32.h b/drivers/pci/controller/dwc/pcie-stm32.h index 09d39f04e469..419cf1ff669d 100644 --- a/drivers/pci/controller/dwc/pcie-stm32.h +++ b/drivers/pci/controller/dwc/pcie-stm32.h @@ -6,6 +6,9 @@ * Author: Christian Bruel <christian.bruel@foss.st.com> */ +#include <linux/bits.h> +#include <linux/device.h> + #define to_stm32_pcie(x) dev_get_drvdata((x)->dev) #define STM32MP25_PCIECR_TYPE_MASK GENMASK(11, 8) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 10e74458e667..0ddeef70726d 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -260,7 +260,6 @@ struct tegra_pcie_dw { u32 msi_ctrl_int; u32 num_lanes; u32 cid; - u32 cfg_link_cap_l1sub; u32 ras_des_cap; u32 pcie_cap_base; u32 aspm_cmrt; @@ -475,8 +474,7 @@ static irqreturn_t tegra_pcie_ep_irq_thread(int irq, void *arg) return IRQ_HANDLED; /* If EP doesn't advertise L1SS, just return */ - val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub); - if (!(val & (PCI_L1SS_CAP_ASPM_L1_1 | PCI_L1SS_CAP_ASPM_L1_2))) + if (!pci->l1ss_support) return IRQ_HANDLED; /* Check if BME is set to '1' */ @@ -608,24 +606,6 @@ static struct pci_ops tegra_pci_ops = { }; #if defined(CONFIG_PCIEASPM) -static void disable_aspm_l11(struct tegra_pcie_dw *pcie) -{ - u32 val; - - val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub); - val &= ~PCI_L1SS_CAP_ASPM_L1_1; - dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val); -} - -static void disable_aspm_l12(struct tegra_pcie_dw *pcie) -{ - u32 val; - - val = dw_pcie_readl_dbi(&pcie->pci, 
pcie->cfg_link_cap_l1sub); - val &= ~PCI_L1SS_CAP_ASPM_L1_2; - dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val); -} - static inline u32 event_counter_prog(struct tegra_pcie_dw *pcie, u32 event) { u32 val; @@ -682,10 +662,9 @@ static int aspm_state_cnt(struct seq_file *s, void *data) static void init_host_aspm(struct tegra_pcie_dw *pcie) { struct dw_pcie *pci = &pcie->pci; - u32 val; + u32 l1ss, val; - val = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS); - pcie->cfg_link_cap_l1sub = val + PCI_L1SS_CAP; + l1ss = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS); pcie->ras_des_cap = dw_pcie_find_ext_capability(&pcie->pci, PCI_EXT_CAP_ID_VNDR); @@ -697,11 +676,14 @@ static void init_host_aspm(struct tegra_pcie_dw *pcie) PCIE_RAS_DES_EVENT_COUNTER_CONTROL, val); /* Program T_cmrt and T_pwr_on values */ - val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub); + val = dw_pcie_readl_dbi(pci, l1ss + PCI_L1SS_CAP); val &= ~(PCI_L1SS_CAP_CM_RESTORE_TIME | PCI_L1SS_CAP_P_PWR_ON_VALUE); val |= (pcie->aspm_cmrt << 8); val |= (pcie->aspm_pwr_on_t << 19); - dw_pcie_writel_dbi(pci, pcie->cfg_link_cap_l1sub, val); + dw_pcie_writel_dbi(pci, l1ss + PCI_L1SS_CAP, val); + + if (pcie->supports_clkreq) + pci->l1ss_support = true; /* Program L0s and L1 entrance latencies */ val = dw_pcie_readl_dbi(pci, PCIE_PORT_AFR); @@ -726,8 +708,6 @@ static void init_debugfs(struct tegra_pcie_dw *pcie) aspm_state_cnt); } #else -static inline void disable_aspm_l12(struct tegra_pcie_dw *pcie) { return; } -static inline void disable_aspm_l11(struct tegra_pcie_dw *pcie) { return; } static inline void init_host_aspm(struct tegra_pcie_dw *pcie) { return; } static inline void init_debugfs(struct tegra_pcie_dw *pcie) { return; } #endif @@ -931,12 +911,6 @@ static int tegra_pcie_dw_host_init(struct dw_pcie_rp *pp) init_host_aspm(pcie); - /* Disable ASPM-L1SS advertisement if there is no CLKREQ routing */ - if (!pcie->supports_clkreq) { - disable_aspm_l11(pcie); - disable_aspm_l12(pcie); - } - if (!pcie->of_data->has_l1ss_exit_fix) { val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL; @@ -1871,12 +1845,6 @@ static void pex_ep_event_pex_rst_deassert(struct tegra_pcie_dw *pcie) init_host_aspm(pcie); - /* Disable ASPM-L1SS advertisement if there is no CLKREQ routing */ - if (!pcie->supports_clkreq) { - disable_aspm_l11(pcie); - disable_aspm_l12(pcie); - } - if (!pcie->of_data->has_l1ss_exit_fix) { val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL; diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index 810d1c8de24e..c473e7c03bac 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -53,16 +53,12 @@ struct pci_config_window *pci_host_common_ecam_create(struct device *dev, EXPORT_SYMBOL_GPL(pci_host_common_ecam_create); int pci_host_common_init(struct platform_device *pdev, + struct pci_host_bridge *bridge, const struct pci_ecam_ops *ops) { struct device *dev = &pdev->dev; - struct pci_host_bridge *bridge; struct pci_config_window *cfg; - bridge = devm_pci_alloc_host_bridge(dev, 0); - if (!bridge) - return -ENOMEM; - of_pci_check_probe_only(); platform_set_drvdata(pdev, bridge); @@ -85,12 +81,17 @@ EXPORT_SYMBOL_GPL(pci_host_common_init); int pci_host_common_probe(struct platform_device *pdev) { const struct pci_ecam_ops *ops; + struct pci_host_bridge *bridge; ops = of_device_get_match_data(&pdev->dev); if (!ops) return -ENODEV; - return 
pci_host_common_init(pdev, ops); + bridge = devm_pci_alloc_host_bridge(&pdev->dev, 0); + if (!bridge) + return -ENOMEM; + + return pci_host_common_init(pdev, bridge, ops); } EXPORT_SYMBOL_GPL(pci_host_common_probe); diff --git a/drivers/pci/controller/pci-host-common.h b/drivers/pci/controller/pci-host-common.h index 51c35ec0cf37..b5075d4bd7eb 100644 --- a/drivers/pci/controller/pci-host-common.h +++ b/drivers/pci/controller/pci-host-common.h @@ -14,6 +14,7 @@ struct pci_ecam_ops; int pci_host_common_probe(struct platform_device *pdev); int pci_host_common_init(struct platform_device *pdev, + struct pci_host_bridge *bridge, const struct pci_ecam_ops *ops); void pci_host_common_remove(struct platform_device *pdev); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 146b43981b27..1e237d3538f9 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3696,48 +3696,6 @@ static int hv_send_resources_released(struct hv_device *hdev) return 0; } -#define HVPCI_DOM_MAP_SIZE (64 * 1024) -static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); - -/* - * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 - * as invalid for passthrough PCI devices of this driver. - */ -#define HVPCI_DOM_INVALID 0 - -/** - * hv_get_dom_num() - Get a valid PCI domain number - * Check if the PCI domain number is in use, and return another number if - * it is in use. - * - * @dom: Requested domain number - * - * return: domain number on success, HVPCI_DOM_INVALID on failure - */ -static u16 hv_get_dom_num(u16 dom) -{ - unsigned int i; - - if (test_and_set_bit(dom, hvpci_dom_map) == 0) - return dom; - - for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { - if (test_and_set_bit(i, hvpci_dom_map) == 0) - return i; - } - - return HVPCI_DOM_INVALID; -} - -/** - * hv_put_dom_num() - Mark the PCI domain number as free - * @dom: Domain number to be freed - */ -static void hv_put_dom_num(u16 dom) -{ - clear_bit(dom, hvpci_dom_map); -} - /** * hv_pci_probe() - New VMBus channel probe, for a root PCI bus * @hdev: VMBus's tracking struct for this root PCI bus @@ -3750,9 +3708,9 @@ static int hv_pci_probe(struct hv_device *hdev, { struct pci_host_bridge *bridge; struct hv_pcibus_device *hbus; - u16 dom_req, dom; + int ret, dom; + u16 dom_req; char *name; - int ret; bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); if (!bridge) @@ -3779,11 +3737,14 @@ static int hv_pci_probe(struct hv_device *hdev, * PCI bus (which is actually emulated by the hypervisor) is domain 0. * (2) There will be no overlap between domains (after fixing possible * collisions) in the same VM. + * + * Because Gen1 VMs use domain 0, don't allow picking domain 0 here, + * even if bytes 4 and 5 of the instance GUID are both zero. For wider + * userspace compatibility, limit the domain ID to a 16-bit value. 
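+ * pci_bus_find_emul_domain_nr() below reserves dom_req, or another free
+ * domain in [1, U16_MAX] if dom_req is already taken; a negative return
+ * means no domain number could be reserved.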
*/ dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; - dom = hv_get_dom_num(dom_req); - - if (dom == HVPCI_DOM_INVALID) { + dom = pci_bus_find_emul_domain_nr(dom_req, 1, U16_MAX); + if (dom < 0) { dev_err(&hdev->device, "Unable to use dom# 0x%x or other numbers", dom_req); ret = -EINVAL; @@ -3917,7 +3878,7 @@ close: destroy_wq: destroy_workqueue(hbus->wq); free_dom: - hv_put_dom_num(hbus->bridge->domain_nr); + pci_bus_release_emul_domain_nr(hbus->bridge->domain_nr); free_bus: kfree(hbus); return ret; @@ -4042,8 +4003,6 @@ static void hv_pci_remove(struct hv_device *hdev) irq_domain_remove(hbus->irq_domain); irq_domain_free_fwnode(hbus->fwnode); - hv_put_dom_num(hbus->bridge->domain_nr); - kfree(hbus); } @@ -4217,9 +4176,6 @@ static int __init init_hv_pci_drv(void) if (ret) return ret; - /* Set the invalid domain number's bit, so it will not be used */ - set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); - /* Initialize PCI block r/w interface */ hvpci_block_ops.read_block = hv_read_config_block; hvpci_block_ops.write_block = hv_write_config_block; diff --git a/drivers/pci/controller/pci-ixp4xx.c b/drivers/pci/controller/pci-ixp4xx.c index acb85e0d5675..9fd401838bad 100644 --- a/drivers/pci/controller/pci-ixp4xx.c +++ b/drivers/pci/controller/pci-ixp4xx.c @@ -214,6 +214,7 @@ static u32 ixp4xx_crp_byte_lane_enable_bits(u32 n, int size) return 0xffffffff; } +#ifdef CONFIG_ARM static int ixp4xx_crp_read_config(struct ixp4xx_pci *p, int where, int size, u32 *value) { @@ -251,6 +252,7 @@ static int ixp4xx_crp_read_config(struct ixp4xx_pci *p, int where, int size, return PCIBIOS_SUCCESSFUL; } +#endif static int ixp4xx_crp_write_config(struct ixp4xx_pci *p, int where, int size, u32 value) @@ -470,6 +472,7 @@ static int ixp4xx_pci_parse_map_dma_ranges(struct ixp4xx_pci *p) return 0; } +#ifdef CONFIG_ARM /* Only used to get context for abort handling */ static struct ixp4xx_pci *ixp4xx_pci_abort_singleton; @@ -509,6 +512,7 @@ static int ixp4xx_pci_abort_handler(unsigned long addr, unsigned int fsr, return 0; } +#endif static int __init ixp4xx_pci_probe(struct platform_device *pdev) { @@ -555,10 +559,12 @@ static int __init ixp4xx_pci_probe(struct platform_device *pdev) dev_info(dev, "controller is in %s mode\n", p->host_mode ? 
"host" : "option"); +#ifdef CONFIG_ARM /* Hook in our fault handler for PCI errors */ ixp4xx_pci_abort_singleton = p; hook_fault_code(16+6, ixp4xx_pci_abort_handler, SIGBUS, 0, "imprecise external abort"); +#endif ret = ixp4xx_pci_parse_map_ranges(p); if (ret) diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c index 0380d300adca..2d92fc79f6dd 100644 --- a/drivers/pci/controller/pcie-apple.c +++ b/drivers/pci/controller/pcie-apple.c @@ -187,7 +187,6 @@ struct apple_pcie { const struct hw_info *hw; unsigned long *bitmap; struct list_head ports; - struct list_head entry; struct completion event; struct irq_fwspec fwspec; u32 nvecs; @@ -206,9 +205,6 @@ struct apple_pcie_port { int idx; }; -static LIST_HEAD(pcie_list); -static DEFINE_MUTEX(pcie_list_lock); - static void rmw_set(u32 set, void __iomem *addr) { writel_relaxed(readl_relaxed(addr) | set, addr); @@ -724,32 +720,9 @@ static int apple_msi_init(struct apple_pcie *pcie) return 0; } -static void apple_pcie_register(struct apple_pcie *pcie) -{ - guard(mutex)(&pcie_list_lock); - - list_add_tail(&pcie->entry, &pcie_list); -} - -static void apple_pcie_unregister(struct apple_pcie *pcie) -{ - guard(mutex)(&pcie_list_lock); - - list_del(&pcie->entry); -} - static struct apple_pcie *apple_pcie_lookup(struct device *dev) { - struct apple_pcie *pcie; - - guard(mutex)(&pcie_list_lock); - - list_for_each_entry(pcie, &pcie_list, entry) { - if (pcie->dev == dev) - return pcie; - } - - return NULL; + return pci_host_bridge_priv(dev_get_drvdata(dev)); } static struct apple_pcie_port *apple_pcie_get_port(struct pci_dev *pdev) @@ -875,13 +848,15 @@ static const struct pci_ecam_ops apple_pcie_cfg_ecam_ops = { static int apple_pcie_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct pci_host_bridge *bridge; struct apple_pcie *pcie; int ret; - pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); - if (!pcie) + bridge = devm_pci_alloc_host_bridge(dev, sizeof(*pcie)); + if (!bridge) return -ENOMEM; + pcie = pci_host_bridge_priv(bridge); pcie->dev = dev; pcie->hw = of_device_get_match_data(dev); if (!pcie->hw) @@ -897,13 +872,7 @@ static int apple_pcie_probe(struct platform_device *pdev) if (ret) return ret; - apple_pcie_register(pcie); - - ret = pci_host_common_init(pdev, &apple_pcie_cfg_ecam_ops); - if (ret) - apple_pcie_unregister(pcie); - - return ret; + return pci_host_common_init(pdev, bridge, &apple_pcie_cfg_ecam_ops); } static const struct of_device_id apple_pcie_of_match[] = { diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 9afbd02ded35..062f55690012 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -14,15 +14,18 @@ #include <linux/irqchip/chained_irq.h> #include <linux/irqchip/irq-msi-lib.h> #include <linux/irqdomain.h> +#include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/module.h> #include <linux/msi.h> +#include <linux/notifier.h> #include <linux/of_address.h> #include <linux/of_irq.h> #include <linux/of_pci.h> #include <linux/of_platform.h> +#include <linux/panic_notifier.h> #include <linux/pci.h> #include <linux/pci-ecam.h> #include <linux/printk.h> @@ -30,7 +33,9 @@ #include <linux/reset.h> #include <linux/sizes.h> #include <linux/slab.h> +#include <linux/spinlock.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/types.h> #include "../pci.h" @@ -48,7 +53,6 @@ #define 
PCIE_RC_CFG_PRIV1_LINK_CAPABILITY 0x04dc #define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_MAX_LINK_WIDTH_MASK 0x1f0 -#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK 0xc00 #define PCIE_RC_CFG_PRIV1_ROOT_CAP 0x4f8 #define PCIE_RC_CFG_PRIV1_ROOT_CAP_L1SS_MODE_MASK 0xf8 @@ -155,8 +159,40 @@ #define MSI_INT_MASK_SET 0x10 #define MSI_INT_MASK_CLR 0x14 +/* Error report registers */ +#define PCIE_OUTB_ERR_TREAT 0x6000 +#define PCIE_OUTB_ERR_TREAT_CONFIG 0x1 +#define PCIE_OUTB_ERR_TREAT_MEM 0x2 +#define PCIE_OUTB_ERR_VALID 0x6004 +#define PCIE_OUTB_ERR_CLEAR 0x6008 +#define PCIE_OUTB_ERR_ACC_INFO 0x600c +#define PCIE_OUTB_ERR_ACC_INFO_CFG_ERR BIT(0) +#define PCIE_OUTB_ERR_ACC_INFO_MEM_ERR BIT(1) +#define PCIE_OUTB_ERR_ACC_INFO_TYPE_64 BIT(2) +#define PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE BIT(4) +#define PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES 0xff00 +#define PCIE_OUTB_ERR_ACC_ADDR 0x6010 +#define PCIE_OUTB_ERR_ACC_ADDR_BUS 0xff00000 +#define PCIE_OUTB_ERR_ACC_ADDR_DEV 0xf8000 +#define PCIE_OUTB_ERR_ACC_ADDR_FUNC 0x7000 +#define PCIE_OUTB_ERR_ACC_ADDR_REG 0xfff +#define PCIE_OUTB_ERR_CFG_CAUSE 0x6014 +#define PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT BIT(6) +#define PCIE_OUTB_ERR_CFG_CAUSE_ABORT BIT(5) +#define PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ BIT(4) +#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT BIT(2) +#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED BIT(1) +#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT BIT(0) +#define PCIE_OUTB_ERR_MEM_ADDR_LO 0x6018 +#define PCIE_OUTB_ERR_MEM_ADDR_HI 0x601c +#define PCIE_OUTB_ERR_MEM_CAUSE 0x6020 +#define PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT BIT(6) +#define PCIE_OUTB_ERR_MEM_CAUSE_ABORT BIT(5) +#define PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ BIT(4) +#define PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED BIT(1) +#define PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR BIT(0) + #define PCIE_RGR1_SW_INIT_1_PERST_MASK 0x1 -#define PCIE_RGR1_SW_INIT_1_PERST_SHIFT 0x0 #define RGR1_SW_INIT_1_INIT_GENERIC_MASK 0x2 #define RGR1_SW_INIT_1_INIT_GENERIC_SHIFT 0x1 @@ -259,6 +295,7 @@ struct pcie_cfg_data { int (*perst_set)(struct brcm_pcie *pcie, u32 val); int (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val); int (*post_setup)(struct brcm_pcie *pcie); + bool has_err_report; }; struct subdev_regulators { @@ -303,6 +340,10 @@ struct brcm_pcie { struct subdev_regulators *sr; bool ep_wakeup_capable; const struct pcie_cfg_data *cfg; + bool bridge_in_reset; + struct notifier_block die_notifier; + struct notifier_block panic_notifier; + spinlock_t bridge_lock; }; static inline bool is_bmips(const struct brcm_pcie *pcie) @@ -310,6 +351,24 @@ static inline bool is_bmips(const struct brcm_pcie *pcie) return pcie->cfg->soc_base == BCM7435 || pcie->cfg->soc_base == BCM7425; } +static int brcm_pcie_bridge_sw_init_set(struct brcm_pcie *pcie, u32 val) +{ + unsigned long flags; + int ret; + + if (pcie->cfg->has_err_report) + spin_lock_irqsave(&pcie->bridge_lock, flags); + + ret = pcie->cfg->bridge_sw_init_set(pcie, val); + /* If we fail, assume the bridge is in reset (off) */ + pcie->bridge_in_reset = ret ? 
true : val; + + if (pcie->cfg->has_err_report) + spin_unlock_irqrestore(&pcie->bridge_lock, flags); + + return ret; +} + /* * This is to convert the size of the inbound "BAR" region to the * non-linear values of PCIE_X_MISC_RC_BAR[123]_CONFIG_LO.SIZE @@ -1075,13 +1134,13 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) void __iomem *base = pcie->base; struct pci_host_bridge *bridge; struct resource_entry *entry; - u32 tmp, burst, aspm_support, num_lanes, num_lanes_cap; + u32 tmp, burst, num_lanes, num_lanes_cap; u8 num_out_wins = 0; int num_inbound_wins = 0; int memc, ret; /* Reset the bridge */ - ret = pcie->cfg->bridge_sw_init_set(pcie, 1); + ret = brcm_pcie_bridge_sw_init_set(pcie, 1); if (ret) return ret; @@ -1097,7 +1156,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) usleep_range(100, 200); /* Take the bridge out of reset */ - ret = pcie->cfg->bridge_sw_init_set(pcie, 0); + ret = brcm_pcie_bridge_sw_init_set(pcie, 0); if (ret) return ret; @@ -1175,12 +1234,9 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) /* Don't advertise L0s capability if 'aspm-no-l0s' */ - aspm_support = PCIE_LINK_STATE_L1; - if (!of_property_read_bool(pcie->np, "aspm-no-l0s")) - aspm_support |= PCIE_LINK_STATE_L0S; tmp = readl(base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); - u32p_replace_bits(&tmp, aspm_support, - PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK); + if (of_property_read_bool(pcie->np, "aspm-no-l0s")) + tmp &= ~PCI_EXP_LNKCAP_ASPM_L0S; writel(tmp, base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); /* 'tmp' still holds the contents of PRIV1_LINK_CAPABILITY */ @@ -1565,7 +1621,7 @@ static int brcm_pcie_turn_off(struct brcm_pcie *pcie) if (!(pcie->cfg->quirks & CFG_QUIRK_AVOID_BRIDGE_SHUTDOWN)) /* Shutdown PCIe bridge */ - ret = pcie->cfg->bridge_sw_init_set(pcie, 1); + ret = brcm_pcie_bridge_sw_init_set(pcie, 1); return ret; } @@ -1653,7 +1709,9 @@ static int brcm_pcie_resume_noirq(struct device *dev) goto err_reset; /* Take bridge out of reset so we can access the SERDES reg */ - pcie->cfg->bridge_sw_init_set(pcie, 0); + ret = brcm_pcie_bridge_sw_init_set(pcie, 0); + if (ret) + goto err_reset; /* SERDES_IDDQ = 0 */ tmp = readl(base + HARD_DEBUG(pcie)); @@ -1707,6 +1765,119 @@ err_disable_clk: return ret; } +/* Dump out PCIe errors on die or panic */ +static int brcm_pcie_dump_err(struct brcm_pcie *pcie, + const char *type) +{ + void __iomem *base = pcie->base; + int i, is_cfg_err, is_mem_err, lanes; + const char *width_str, *direction_str; + u32 info, cfg_addr, cfg_cause, mem_cause, lo, hi; + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); + unsigned long flags; + char lanes_str[9]; + + spin_lock_irqsave(&pcie->bridge_lock, flags); + /* Don't access registers when the bridge is off */ + if (pcie->bridge_in_reset || readl(base + PCIE_OUTB_ERR_VALID) == 0) { + spin_unlock_irqrestore(&pcie->bridge_lock, flags); + return NOTIFY_DONE; + } + + /* Read all necessary registers so we can release the spinlock ASAP */ + info = readl(base + PCIE_OUTB_ERR_ACC_INFO); + is_cfg_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_CFG_ERR); + is_mem_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_MEM_ERR); + if (is_cfg_err) { + cfg_addr = readl(base + PCIE_OUTB_ERR_ACC_ADDR); + cfg_cause = readl(base + PCIE_OUTB_ERR_CFG_CAUSE); + } + if (is_mem_err) { + mem_cause = readl(base + PCIE_OUTB_ERR_MEM_CAUSE); + lo = readl(base + PCIE_OUTB_ERR_MEM_ADDR_LO); + hi = readl(base + PCIE_OUTB_ERR_MEM_ADDR_HI); + } + /* We've got all of the info, clear the error */ + writel(1, base + PCIE_OUTB_ERR_CLEAR); + 
spin_unlock_irqrestore(&pcie->bridge_lock, flags); + + dev_err(pcie->dev, "reporting PCIe info which may be related to %s error\n", + type); + width_str = (info & PCIE_OUTB_ERR_ACC_INFO_TYPE_64) ? "64bit" : "32bit"; + direction_str = str_read_write(!(info & PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE)); + lanes = FIELD_GET(PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES, info); + for (i = 0, lanes_str[8] = 0; i < 8; i++) + lanes_str[i] = (lanes & (1 << i)) ? '1' : '0'; + + if (is_cfg_err) { + int bus = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_BUS, cfg_addr); + int dev = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_DEV, cfg_addr); + int func = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_FUNC, cfg_addr); + int reg = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_REG, cfg_addr); + + dev_err(pcie->dev, "Error: CFG Acc, %s, %s (%04x:%02x:%02x.%d) reg=0x%x, lanes=%s\n", + width_str, direction_str, bridge->domain_nr, bus, dev, + func, reg, lanes_str); + dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccTO=%d AccDsbld=%d Acc64bit=%d\n", + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT), + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ABORT), + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ), + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT), + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED), + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT)); + } + + if (is_mem_err) { + u64 addr = ((u64)hi << 32) | (u64)lo; + + dev_err(pcie->dev, "Error: Mem Acc, %s, %s, @0x%llx, lanes=%s\n", + width_str, direction_str, addr, lanes_str); + dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccDsble=%d BadAddr=%d\n", + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT), + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ABORT), + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ), + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED), + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR)); + } + + return NOTIFY_DONE; +} + +static int brcm_pcie_die_notify_cb(struct notifier_block *self, + unsigned long v, void *p) +{ + struct brcm_pcie *pcie = + container_of(self, struct brcm_pcie, die_notifier); + + return brcm_pcie_dump_err(pcie, "Die"); +} + +static int brcm_pcie_panic_notify_cb(struct notifier_block *self, + unsigned long v, void *p) +{ + struct brcm_pcie *pcie = + container_of(self, struct brcm_pcie, panic_notifier); + + return brcm_pcie_dump_err(pcie, "Panic"); +} + +static void brcm_register_die_notifiers(struct brcm_pcie *pcie) +{ + pcie->panic_notifier.notifier_call = brcm_pcie_panic_notify_cb; + atomic_notifier_chain_register(&panic_notifier_list, + &pcie->panic_notifier); + + pcie->die_notifier.notifier_call = brcm_pcie_die_notify_cb; + register_die_notifier(&pcie->die_notifier); +} + +static void brcm_unregister_die_notifiers(struct brcm_pcie *pcie) +{ + unregister_die_notifier(&pcie->die_notifier); + atomic_notifier_chain_unregister(&panic_notifier_list, + &pcie->panic_notifier); +} + static void __brcm_pcie_remove(struct brcm_pcie *pcie) { brcm_msi_remove(pcie); @@ -1725,6 +1896,9 @@ static void brcm_pcie_remove(struct platform_device *pdev) pci_stop_root_bus(bridge->bus); pci_remove_root_bus(bridge->bus); + if (pcie->cfg->has_err_report) + brcm_unregister_die_notifiers(pcie); + __brcm_pcie_remove(pcie); } @@ -1825,6 +1999,7 @@ static const struct pcie_cfg_data bcm7216_cfg = { .bridge_sw_init_set = brcm_pcie_bridge_sw_init_set_7278, .has_phy = true, .num_inbound_wins = 3, + .has_err_report = true, }; static const struct pcie_cfg_data bcm7712_cfg = { @@ -1921,7 +2096,10 @@ static int brcm_pcie_probe(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, 
ret, "could not enable clock\n"); - pcie->cfg->bridge_sw_init_set(pcie, 0); + ret = brcm_pcie_bridge_sw_init_set(pcie, 0); + if (ret) + return dev_err_probe(&pdev->dev, ret, + "could not de-assert bridge reset\n"); if (pcie->swinit_reset) { ret = reset_control_assert(pcie->swinit_reset); @@ -1996,6 +2174,11 @@ static int brcm_pcie_probe(struct platform_device *pdev) return ret; } + if (pcie->cfg->has_err_report) { + spin_lock_init(&pcie->bridge_lock); + brcm_register_die_notifiers(pcie); + } + return 0; fail: diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 24cc30a2ab6c..4b78b6528f9f 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -143,23 +143,33 @@ struct mtk_pcie_port; /** + * enum mtk_pcie_quirks - MTK PCIe quirks + * @MTK_PCIE_FIX_CLASS_ID: host's class ID needed to be fixed + * @MTK_PCIE_FIX_DEVICE_ID: host's device ID needed to be fixed + * @MTK_PCIE_NO_MSI: Bridge has no MSI support, and relies on an external block + * @MTK_PCIE_SKIP_RSTB: Skip calling RSTB bits on PCIe probe + */ +enum mtk_pcie_quirks { + MTK_PCIE_FIX_CLASS_ID = BIT(0), + MTK_PCIE_FIX_DEVICE_ID = BIT(1), + MTK_PCIE_NO_MSI = BIT(2), + MTK_PCIE_SKIP_RSTB = BIT(3), +}; + +/** * struct mtk_pcie_soc - differentiate between host generations - * @need_fix_class_id: whether this host's class ID needed to be fixed or not - * @need_fix_device_id: whether this host's device ID needed to be fixed or not - * @no_msi: Bridge has no MSI support, and relies on an external block * @device_id: device ID which this host need to be fixed * @ops: pointer to configuration access functions * @startup: pointer to controller setting functions * @setup_irq: pointer to initialize IRQ functions + * @quirks: PCIe device quirks. */ struct mtk_pcie_soc { - bool need_fix_class_id; - bool need_fix_device_id; - bool no_msi; unsigned int device_id; struct pci_ops *ops; int (*startup)(struct mtk_pcie_port *port); int (*setup_irq)(struct mtk_pcie_port *port, struct device_node *node); + enum mtk_pcie_quirks quirks; }; /** @@ -679,31 +689,28 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port) regmap_update_bits(pcie->cfg, PCIE_SYS_CFG_V2, val, val); } - /* Assert all reset signals */ - writel(0, port->base + PCIE_RST_CTRL); + if (!(soc->quirks & MTK_PCIE_SKIP_RSTB)) { + /* Assert all reset signals */ + writel(0, port->base + PCIE_RST_CTRL); - /* - * Enable PCIe link down reset, if link status changed from link up to - * link down, this will reset MAC control registers and configuration - * space. - */ - writel(PCIE_LINKDOWN_RST_EN, port->base + PCIE_RST_CTRL); + /* + * Enable PCIe link down reset, if link status changed from + * link up to link down, this will reset MAC control registers + * and configuration space. + */ + writel(PCIE_LINKDOWN_RST_EN, port->base + PCIE_RST_CTRL); - /* - * Described in PCIe CEM specification sections 2.2 (PERST# Signal) and - * 2.2.1 (Initial Power-Up (G3 to S0)). The deassertion of PERST# should - * be delayed 100ms (TPVPERL) for the power and clock to become stable. 
- */ - msleep(100); + msleep(PCIE_T_PVPERL_MS); - /* De-assert PHY, PE, PIPE, MAC and configuration reset */ - val = readl(port->base + PCIE_RST_CTRL); - val |= PCIE_PHY_RSTB | PCIE_PERSTB | PCIE_PIPE_SRSTB | - PCIE_MAC_SRSTB | PCIE_CRSTB; - writel(val, port->base + PCIE_RST_CTRL); + /* De-assert PHY, PE, PIPE, MAC and configuration reset */ + val = readl(port->base + PCIE_RST_CTRL); + val |= PCIE_PHY_RSTB | PCIE_PERSTB | PCIE_PIPE_SRSTB | + PCIE_MAC_SRSTB | PCIE_CRSTB; + writel(val, port->base + PCIE_RST_CTRL); + } /* Set up vendor ID and class code */ - if (soc->need_fix_class_id) { + if (soc->quirks & MTK_PCIE_FIX_CLASS_ID) { val = PCI_VENDOR_ID_MEDIATEK; writew(val, port->base + PCIE_CONF_VEND_ID); @@ -711,7 +718,7 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port) writew(val, port->base + PCIE_CONF_CLASS_ID); } - if (soc->need_fix_device_id) + if (soc->quirks & MTK_PCIE_FIX_DEVICE_ID) writew(soc->device_id, port->base + PCIE_CONF_DEVICE_ID); /* 100ms timeout value should be enough for Gen1/2 training */ @@ -821,6 +828,41 @@ static int mtk_pcie_startup_port(struct mtk_pcie_port *port) return 0; } +static int mtk_pcie_startup_port_an7583(struct mtk_pcie_port *port) +{ + struct mtk_pcie *pcie = port->pcie; + struct device *dev = pcie->dev; + struct pci_host_bridge *host; + struct resource_entry *entry; + struct regmap *pbus_regmap; + resource_size_t addr; + u32 args[2], size; + + /* + * Configure PBus base address and base address mask to allow + * the hw to detect if a given address is accessible on PCIe + * controller. + */ + pbus_regmap = syscon_regmap_lookup_by_phandle_args(dev->of_node, + "mediatek,pbus-csr", + ARRAY_SIZE(args), + args); + if (IS_ERR(pbus_regmap)) + return PTR_ERR(pbus_regmap); + + host = pci_host_bridge_from_priv(pcie); + entry = resource_list_first_type(&host->windows, IORESOURCE_MEM); + if (!entry) + return -ENODEV; + + addr = entry->res->start - entry->offset; + regmap_write(pbus_regmap, args[0], lower_32_bits(addr)); + size = lower_32_bits(resource_size(entry->res)); + regmap_write(pbus_regmap, args[1], GENMASK(31, __fls(size))); + + return mtk_pcie_startup_port_v2(port); +} + static void mtk_pcie_enable_port(struct mtk_pcie_port *port) { struct mtk_pcie *pcie = port->pcie; @@ -1099,7 +1141,7 @@ static int mtk_pcie_probe(struct platform_device *pdev) host->ops = pcie->soc->ops; host->sysdata = pcie; - host->msi_domain = pcie->soc->no_msi; + host->msi_domain = !!(pcie->soc->quirks & MTK_PCIE_NO_MSI); err = pci_host_probe(host); if (err) @@ -1187,9 +1229,9 @@ static const struct dev_pm_ops mtk_pcie_pm_ops = { }; static const struct mtk_pcie_soc mtk_pcie_soc_v1 = { - .no_msi = true, .ops = &mtk_pcie_ops, .startup = mtk_pcie_startup_port, + .quirks = MTK_PCIE_NO_MSI, }; static const struct mtk_pcie_soc mtk_pcie_soc_mt2712 = { @@ -1199,22 +1241,29 @@ static const struct mtk_pcie_soc mtk_pcie_soc_mt2712 = { }; static const struct mtk_pcie_soc mtk_pcie_soc_mt7622 = { - .need_fix_class_id = true, .ops = &mtk_pcie_ops_v2, .startup = mtk_pcie_startup_port_v2, .setup_irq = mtk_pcie_setup_irq, + .quirks = MTK_PCIE_FIX_CLASS_ID, +}; + +static const struct mtk_pcie_soc mtk_pcie_soc_an7583 = { + .ops = &mtk_pcie_ops_v2, + .startup = mtk_pcie_startup_port_an7583, + .setup_irq = mtk_pcie_setup_irq, + .quirks = MTK_PCIE_FIX_CLASS_ID | MTK_PCIE_SKIP_RSTB, }; static const struct mtk_pcie_soc mtk_pcie_soc_mt7629 = { - .need_fix_class_id = true, - .need_fix_device_id = true, .device_id = PCI_DEVICE_ID_MEDIATEK_7629, .ops = &mtk_pcie_ops_v2, .startup = 
mtk_pcie_startup_port_v2, .setup_irq = mtk_pcie_setup_irq, + .quirks = MTK_PCIE_FIX_CLASS_ID | MTK_PCIE_FIX_DEVICE_ID, }; static const struct of_device_id mtk_pcie_ids[] = { + { .compatible = "airoha,an7583-pcie", .data = &mtk_pcie_soc_an7583 }, { .compatible = "mediatek,mt2701-pcie", .data = &mtk_pcie_soc_v1 }, { .compatible = "mediatek,mt7623-pcie", .data = &mtk_pcie_soc_v1 }, { .compatible = "mediatek,mt2712-pcie", .data = &mtk_pcie_soc_mt2712 }, diff --git a/drivers/pci/controller/pcie-rzg3s-host.c b/drivers/pci/controller/pcie-rzg3s-host.c new file mode 100644 index 000000000000..667e6d629474 --- /dev/null +++ b/drivers/pci/controller/pcie-rzg3s-host.c @@ -0,0 +1,1761 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCIe driver for Renesas RZ/G3S SoCs + * + * Copyright (C) 2025 Renesas Electronics Corp. + * + * Based on: + * drivers/pci/controller/pcie-rcar-host.c + * Copyright (C) 2009 - 2011 Paul Mundt + */ + +#include <linux/bitfield.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/cleanup.h> +#include <linux/clk.h> +#include <linux/delay.h> +#include <linux/iopoll.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqchip/chained_irq.h> +#include <linux/irqchip/irq-msi-lib.h> +#include <linux/irqdomain.h> +#include <linux/kernel.h> +#include <linux/mfd/syscon.h> +#include <linux/mutex.h> +#include <linux/msi.h> +#include <linux/of_irq.h> +#include <linux/pci.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> +#include <linux/regmap.h> +#include <linux/reset.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/units.h> + +#include "../pci.h" + +/* AXI registers */ +#define RZG3S_PCI_REQDATA(id) (0x80 + (id) * 0x4) +#define RZG3S_PCI_REQRCVDAT 0x8c + +#define RZG3S_PCI_REQADR1 0x90 +#define RZG3S_PCI_REQADR1_BUS GENMASK(31, 24) +#define RZG3S_PCI_REQADR1_DEV GENMASK(23, 19) +#define RZG3S_PCI_REQADR1_FUNC GENMASK(18, 16) +#define RZG3S_PCI_REQADR1_REG GENMASK(11, 0) + +#define RZG3S_PCI_REQBE 0x98 +#define RZG3S_PCI_REQBE_BYTE_EN GENMASK(3, 0) + +#define RZG3S_PCI_REQISS 0x9c +#define RZG3S_PCI_REQISS_MOR_STATUS GENMASK(18, 16) +#define RZG3S_PCI_REQISS_TR_TYPE GENMASK(11, 8) +#define RZG3S_PCI_REQISS_TR_TP0_RD FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x4) +#define RZG3S_PCI_REQISS_TR_TP0_WR FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x5) +#define RZG3S_PCI_REQISS_TR_TP1_RD FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x6) +#define RZG3S_PCI_REQISS_TR_TP1_WR FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x7) +#define RZG3S_PCI_REQISS_REQ_ISSUE BIT(0) + +#define RZG3S_PCI_MSIRCVWADRL 0x100 +#define RZG3S_PCI_MSIRCVWADRL_MASK GENMASK(31, 3) +#define RZG3S_PCI_MSIRCVWADRL_MSG_DATA_ENA BIT(1) +#define RZG3S_PCI_MSIRCVWADRL_ENA BIT(0) + +#define RZG3S_PCI_MSIRCVWADRU 0x104 + +#define RZG3S_PCI_MSIRCVWMSKL 0x108 +#define RZG3S_PCI_MSIRCVWMSKL_MASK GENMASK(31, 2) + +#define RZG3S_PCI_PINTRCVIE 0x110 +#define RZG3S_PCI_PINTRCVIE_INTX(i) BIT(i) +#define RZG3S_PCI_PINTRCVIE_MSI BIT(4) + +#define RZG3S_PCI_PINTRCVIS 0x114 +#define RZG3S_PCI_PINTRCVIS_INTX(i) BIT(i) +#define RZG3S_PCI_PINTRCVIS_MSI BIT(4) + +#define RZG3S_PCI_MSGRCVIE 0x120 +#define RZG3S_PCI_MSGRCVIE_MSG_RCV BIT(24) + +#define RZG3S_PCI_MSGRCVIS 0x124 +#define RZG3S_PCI_MSGRCVIS_MRI BIT(24) + +#define RZG3S_PCI_PEIE0 0x200 + +#define RZG3S_PCI_PEIS0 0x204 +#define RZG3S_PCI_PEIS0_RX_DLLP_PM_ENTER BIT(12) +#define RZG3S_PCI_PEIS0_DL_UPDOWN BIT(9) + +#define RZG3S_PCI_PEIE1 0x208 +#define RZG3S_PCI_PEIS1 0x20c +#define RZG3S_PCI_AMEIS 0x214 +#define RZG3S_PCI_ASEIS1 0x224 
+ +#define RZG3S_PCI_PCSTAT1 0x408 +#define RZG3S_PCI_PCSTAT1_LTSSM_STATE GENMASK(14, 10) +#define RZG3S_PCI_PCSTAT1_DL_DOWN_STS BIT(0) + +#define RZG3S_PCI_PCCTRL2 0x410 +#define RZG3S_PCI_PCCTRL2_LS_CHG GENMASK(9, 8) +#define RZG3S_PCI_PCCTRL2_LS_CHG_REQ BIT(0) + +#define RZG3S_PCI_PCSTAT2 0x414 +#define RZG3S_PCI_PCSTAT2_LS_CHG_DONE BIT(28) +#define RZG3S_PCI_PCSTAT2_SDRIRE GENMASK(7, 1) + +#define RZG3S_PCI_PERM 0x300 +#define RZG3S_PCI_PERM_CFG_HWINIT_EN BIT(2) +#define RZG3S_PCI_PERM_PIPE_PHY_REG_EN BIT(1) + +#define RZG3S_PCI_MSIRE(id) (0x600 + (id) * 0x10) +#define RZG3S_PCI_MSIRE_ENA BIT(0) + +#define RZG3S_PCI_MSIRM(id) (0x608 + (id) * 0x10) +#define RZG3S_PCI_MSIRS(id) (0x60c + (id) * 0x10) + +#define RZG3S_PCI_AWBASEL(id) (0x1000 + (id) * 0x20) +#define RZG3S_PCI_AWBASEL_WIN_ENA BIT(0) + +#define RZG3S_PCI_AWBASEU(id) (0x1004 + (id) * 0x20) +#define RZG3S_PCI_AWMASKL(id) (0x1008 + (id) * 0x20) +#define RZG3S_PCI_AWMASKU(id) (0x100c + (id) * 0x20) +#define RZG3S_PCI_ADESTL(id) (0x1010 + (id) * 0x20) +#define RZG3S_PCI_ADESTU(id) (0x1014 + (id) * 0x20) + +#define RZG3S_PCI_PWBASEL(id) (0x1100 + (id) * 0x20) +#define RZG3S_PCI_PWBASEL_ENA BIT(0) + +#define RZG3S_PCI_PWBASEU(id) (0x1104 + (id) * 0x20) +#define RZG3S_PCI_PDESTL(id) (0x1110 + (id) * 0x20) +#define RZG3S_PCI_PDESTU(id) (0x1114 + (id) * 0x20) +#define RZG3S_PCI_PWMASKL(id) (0x1108 + (id) * 0x20) +#define RZG3S_PCI_PWMASKU(id) (0x110c + (id) * 0x20) + +/* PHY control registers */ +#define RZG3S_PCI_PHY_XCFGD(id) (0x2000 + (id) * 0x10) +#define RZG3S_PCI_PHY_XCFGD_NUM 39 + +#define RZG3S_PCI_PHY_XCFGA_CMN(id) (0x2400 + (id) * 0x10) +#define RZG3S_PCI_PHY_XCFGA_CMN_NUM 16 + +#define RZG3S_PCI_PHY_XCFGA_RX(id) (0x2500 + (id) * 0x10) +#define RZG3S_PCI_PHY_XCFGA_RX_NUM 13 + +#define RZG3S_PCI_PHY_XCFGA_TX 0x25d0 + +#define RZG3S_PCI_PHY_XCFG_CTRL 0x2a20 +#define RZG3S_PCI_PHY_XCFG_CTRL_PHYREG_SEL BIT(0) + +/* PCIe registers */ +#define RZG3S_PCI_CFG_BASE 0x6000 +#define RZG3S_PCI_CFG_BARMSK00L 0xa0 +#define RZG3S_PCI_CFG_BARMSK00U 0xa4 + +#define RZG3S_PCI_CFG_PCIEC 0x60 + +/* System controller registers */ +#define RZG3S_SYS_PCIE_RST_RSM_B 0xd74 +#define RZG3S_SYS_PCIE_RST_RSM_B_MASK BIT(0) + +/* Maximum number of windows */ +#define RZG3S_MAX_WINDOWS 8 + +/* Number of MSI interrupts per register */ +#define RZG3S_PCI_MSI_INT_PER_REG 32 +/* The number of MSI interrupts */ +#define RZG3S_PCI_MSI_INT_NR RZG3S_PCI_MSI_INT_PER_REG + +/* Timeouts experimentally determined */ +#define RZG3S_REQ_ISSUE_TIMEOUT_US 2500 + +/** + * struct rzg3s_pcie_msi - RZ/G3S PCIe MSI data structure + * @domain: IRQ domain + * @map: bitmap with the allocated MSIs + * @dma_addr: address of the allocated MSI window + * @window_base: base address of the MSI window + * @pages: allocated pages for MSI window mapping + * @map_lock: lock for bitmap with the allocated MSIs + * @irq: MSI interrupt + */ +struct rzg3s_pcie_msi { + struct irq_domain *domain; + DECLARE_BITMAP(map, RZG3S_PCI_MSI_INT_NR); + dma_addr_t dma_addr; + dma_addr_t window_base; + unsigned long pages; + struct mutex map_lock; + int irq; +}; + +struct rzg3s_pcie_host; + +/** + * struct rzg3s_pcie_soc_data - SoC specific data + * @init_phy: PHY initialization function + * @power_resets: array with the resets that need to be de-asserted after + * power-on + * @cfg_resets: array with the resets that need to be de-asserted after + * configuration + * @num_power_resets: number of power resets + * @num_cfg_resets: number of configuration resets + */ +struct rzg3s_pcie_soc_data { + int 
(*init_phy)(struct rzg3s_pcie_host *host); + const char * const *power_resets; + const char * const *cfg_resets; + u8 num_power_resets; + u8 num_cfg_resets; +}; + +/** + * struct rzg3s_pcie_port - RZ/G3S PCIe Root Port data structure + * @refclk: PCIe reference clock + * @vendor_id: Vendor ID + * @device_id: Device ID + */ +struct rzg3s_pcie_port { + struct clk *refclk; + u32 vendor_id; + u32 device_id; +}; + +/** + * struct rzg3s_pcie_host - RZ/G3S PCIe data structure + * @axi: base address for AXI registers + * @pcie: base address for PCIe registers + * @dev: struct device + * @power_resets: reset control signals that should be set after power up + * @cfg_resets: reset control signals that should be set after configuration + * @sysc: SYSC regmap + * @intx_domain: INTx IRQ domain + * @data: SoC specific data + * @msi: MSI data structure + * @port: PCIe Root Port + * @hw_lock: lock for access to the HW resources + * @intx_irqs: INTx interrupts + * @max_link_speed: maximum supported link speed + */ +struct rzg3s_pcie_host { + void __iomem *axi; + void __iomem *pcie; + struct device *dev; + struct reset_control_bulk_data *power_resets; + struct reset_control_bulk_data *cfg_resets; + struct regmap *sysc; + struct irq_domain *intx_domain; + const struct rzg3s_pcie_soc_data *data; + struct rzg3s_pcie_msi msi; + struct rzg3s_pcie_port port; + raw_spinlock_t hw_lock; + int intx_irqs[PCI_NUM_INTX]; + int max_link_speed; +}; + +#define rzg3s_msi_to_host(_msi) container_of(_msi, struct rzg3s_pcie_host, msi) + +static void rzg3s_pcie_update_bits(void __iomem *base, u32 offset, u32 mask, + u32 val) +{ + u32 tmp; + + tmp = readl_relaxed(base + offset); + tmp &= ~mask; + tmp |= val & mask; + writel_relaxed(tmp, base + offset); +} + +static int rzg3s_pcie_child_issue_request(struct rzg3s_pcie_host *host) +{ + u32 val; + int ret; + + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_REQISS, + RZG3S_PCI_REQISS_REQ_ISSUE, + RZG3S_PCI_REQISS_REQ_ISSUE); + ret = readl_poll_timeout_atomic(host->axi + RZG3S_PCI_REQISS, val, + !(val & RZG3S_PCI_REQISS_REQ_ISSUE), + 5, RZG3S_REQ_ISSUE_TIMEOUT_US); + + if (val & RZG3S_PCI_REQISS_MOR_STATUS) + return -EIO; + + return ret; +} + +static void rzg3s_pcie_child_prepare_bus(struct pci_bus *bus, + unsigned int devfn, int where) +{ + struct rzg3s_pcie_host *host = bus->sysdata; + unsigned int dev, func, reg; + + dev = PCI_SLOT(devfn); + func = PCI_FUNC(devfn); + reg = where & ~0x3; + + /* Set the destination */ + writel_relaxed(FIELD_PREP(RZG3S_PCI_REQADR1_BUS, bus->number) | + FIELD_PREP(RZG3S_PCI_REQADR1_DEV, dev) | + FIELD_PREP(RZG3S_PCI_REQADR1_FUNC, func) | + FIELD_PREP(RZG3S_PCI_REQADR1_REG, reg), + host->axi + RZG3S_PCI_REQADR1); + + /* Set byte enable */ + writel_relaxed(RZG3S_PCI_REQBE_BYTE_EN, host->axi + RZG3S_PCI_REQBE); +} + +static int rzg3s_pcie_child_read_conf(struct rzg3s_pcie_host *host, + struct pci_bus *bus, unsigned int devfn, + int where, u32 *data) +{ + bool type0 = pci_is_root_bus(bus->parent) ? true : false; + int ret; + + rzg3s_pcie_child_prepare_bus(bus, devfn, where); + + /* Set the type of request */ + writel_relaxed(type0 ? 
RZG3S_PCI_REQISS_TR_TP0_RD : + RZG3S_PCI_REQISS_TR_TP1_RD, + host->axi + RZG3S_PCI_REQISS); + + /* Issue the request and wait to finish */ + ret = rzg3s_pcie_child_issue_request(host); + if (ret) + return PCIBIOS_SET_FAILED; + + /* Read the data */ + *data = readl_relaxed(host->axi + RZG3S_PCI_REQRCVDAT); + + return PCIBIOS_SUCCESSFUL; +} + +/* Serialization is provided by 'pci_lock' in drivers/pci/access.c */ +static int rzg3s_pcie_child_read(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val) +{ + struct rzg3s_pcie_host *host = bus->sysdata; + int ret; + + ret = rzg3s_pcie_child_read_conf(host, bus, devfn, where, val); + if (ret != PCIBIOS_SUCCESSFUL) + return ret; + + if (size <= 2) + *val = (*val >> (8 * (where & 3))) & ((1 << (size * 8)) - 1); + + return PCIBIOS_SUCCESSFUL; +} + +static int rzg3s_pcie_child_write_conf(struct rzg3s_pcie_host *host, + struct pci_bus *bus, unsigned int devfn, + int where, u32 data) +{ + bool type0 = pci_is_root_bus(bus->parent) ? true : false; + int ret; + + rzg3s_pcie_child_prepare_bus(bus, devfn, where); + + /* Set the write data */ + writel_relaxed(0, host->axi + RZG3S_PCI_REQDATA(0)); + writel_relaxed(0, host->axi + RZG3S_PCI_REQDATA(1)); + writel_relaxed(data, host->axi + RZG3S_PCI_REQDATA(2)); + + /* Set the type of request */ + writel_relaxed(type0 ? RZG3S_PCI_REQISS_TR_TP0_WR : + RZG3S_PCI_REQISS_TR_TP1_WR, + host->axi + RZG3S_PCI_REQISS); + + /* Issue the request and wait to finish */ + ret = rzg3s_pcie_child_issue_request(host); + if (ret) + return PCIBIOS_SET_FAILED; + + return PCIBIOS_SUCCESSFUL; +} + +/* Serialization is provided by 'pci_lock' in drivers/pci/access.c */ +static int rzg3s_pcie_child_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + struct rzg3s_pcie_host *host = bus->sysdata; + u32 data, shift; + int ret; + + if (size == 4) + return rzg3s_pcie_child_write_conf(host, bus, devfn, where, val); + + /* + * Controller does 32 bit accesses. To do byte accesses software need + * to do read/modify/write. This may have potential side effects. For + * example, software may perform a 16-bit write. If the hardware only + * supports 32-bit accesses, we must do a 32-bit read, merge in the 16 + * bits we intend to write, followed by a 32-bit write. If the 16 bits + * we *don't* intend to write happen to have any RW1C + * (write-one-to-clear) bits set, we just inadvertently cleared + * something we shouldn't have. 
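A minimal sketch of the hazard this warning is about, using made-up accessors (cfg_read32()/cfg_write32() stand in for rzg3s_pcie_child_read_conf()/rzg3s_pcie_child_write_conf()) and made-up register contents:

    /* Sketch only: a 16-bit write at offset 0x06 through a 32-bit-only port. */
    u16 new_val = 0xabcd;                       /* the caller's 16-bit value            */
    u32 data    = cfg_read32(0x04);             /* say 0x00190007; bits 0-15 hold RW1C  */
                                                /* status bits that currently read as 1 */
    u32 shift   = BITS_PER_BYTE * (0x06 & 2);   /* 16 */

    data &= ~(0xffff << shift);                 /* replace only the half being written; */
    data |= (new_val & 0xffff) << shift;        /* bits 0-15 keep the value read, 0x0007 */
    cfg_write32(0x04, data);                    /* writes 1s back into those RW1C bits,  */
                                                /* clearing status the caller never touched */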
+ */ + if (!bus->unsafe_warn) { + dev_warn(&bus->dev, "%d-byte config write to %04x:%02x:%02x.%d offset %#x may corrupt adjacent RW1C bits\n", + size, pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where); + bus->unsafe_warn = 1; + } + + ret = rzg3s_pcie_child_read_conf(host, bus, devfn, where, &data); + if (ret != PCIBIOS_SUCCESSFUL) + return ret; + + if (size == 1) { + shift = BITS_PER_BYTE * (where & 3); + data &= ~(0xff << shift); + data |= ((val & 0xff) << shift); + } else if (size == 2) { + shift = BITS_PER_BYTE * (where & 2); + data &= ~(0xffff << shift); + data |= ((val & 0xffff) << shift); + } else { + data = val; + } + + return rzg3s_pcie_child_write_conf(host, bus, devfn, where, data); +} + +static struct pci_ops rzg3s_pcie_child_ops = { + .read = rzg3s_pcie_child_read, + .write = rzg3s_pcie_child_write, +}; + +static void __iomem *rzg3s_pcie_root_map_bus(struct pci_bus *bus, + unsigned int devfn, int where) +{ + struct rzg3s_pcie_host *host = bus->sysdata; + + if (devfn) + return NULL; + + return host->pcie + where; +} + +/* Serialized by 'pci_lock' */ +static int rzg3s_pcie_root_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + struct rzg3s_pcie_host *host = bus->sysdata; + int ret; + + /* Enable access control to the CFGU */ + writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN, + host->axi + RZG3S_PCI_PERM); + + ret = pci_generic_config_write(bus, devfn, where, size, val); + + /* Disable access control to the CFGU */ + writel_relaxed(0, host->axi + RZG3S_PCI_PERM); + + return ret; +} + +static struct pci_ops rzg3s_pcie_root_ops = { + .read = pci_generic_config_read, + .write = rzg3s_pcie_root_write, + .map_bus = rzg3s_pcie_root_map_bus, +}; + +static void rzg3s_pcie_intx_irq_handler(struct irq_desc *desc) +{ + struct rzg3s_pcie_host *host = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned int irq = irq_desc_get_irq(desc); + u32 intx = irq - host->intx_irqs[0]; + + chained_irq_enter(chip, desc); + generic_handle_domain_irq(host->intx_domain, intx); + chained_irq_exit(chip, desc); +} + +static irqreturn_t rzg3s_pcie_msi_irq(int irq, void *data) +{ + u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG; + DECLARE_BITMAP(bitmap, RZG3S_PCI_MSI_INT_NR); + struct rzg3s_pcie_host *host = data; + struct rzg3s_pcie_msi *msi = &host->msi; + unsigned long bit; + u32 status; + + status = readl_relaxed(host->axi + RZG3S_PCI_PINTRCVIS); + if (!(status & RZG3S_PCI_PINTRCVIS_MSI)) + return IRQ_NONE; + + /* Clear the MSI */ + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIS, + RZG3S_PCI_PINTRCVIS_MSI, + RZG3S_PCI_PINTRCVIS_MSI); + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSGRCVIS, + RZG3S_PCI_MSGRCVIS_MRI, RZG3S_PCI_MSGRCVIS_MRI); + + for (u8 reg_id = 0; reg_id < regs; reg_id++) { + status = readl_relaxed(host->axi + RZG3S_PCI_MSIRS(reg_id)); + bitmap_write(bitmap, status, reg_id * RZG3S_PCI_MSI_INT_PER_REG, + RZG3S_PCI_MSI_INT_PER_REG); + } + + for_each_set_bit(bit, bitmap, RZG3S_PCI_MSI_INT_NR) { + int ret; + + ret = generic_handle_domain_irq(msi->domain, bit); + if (ret) { + u8 reg_bit = bit % RZG3S_PCI_MSI_INT_PER_REG; + u8 reg_id = bit / RZG3S_PCI_MSI_INT_PER_REG; + + /* Unknown MSI, just clear it */ + writel_relaxed(BIT(reg_bit), + host->axi + RZG3S_PCI_MSIRS(reg_id)); + } + } + + return IRQ_HANDLED; +} + +static void rzg3s_pcie_msi_irq_ack(struct irq_data *d) +{ + struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d); + struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi); 
+ u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG; + u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG; + + guard(raw_spinlock_irqsave)(&host->hw_lock); + + writel_relaxed(BIT(reg_bit), host->axi + RZG3S_PCI_MSIRS(reg_id)); +} + +static void rzg3s_pcie_msi_irq_mask(struct irq_data *d) +{ + struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d); + struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi); + u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG; + u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG; + + guard(raw_spinlock_irqsave)(&host->hw_lock); + + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSIRM(reg_id), BIT(reg_bit), + BIT(reg_bit)); +} + +static void rzg3s_pcie_msi_irq_unmask(struct irq_data *d) +{ + struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d); + struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi); + u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG; + u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG; + + guard(raw_spinlock_irqsave)(&host->hw_lock); + + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSIRM(reg_id), BIT(reg_bit), + 0); +} + +static void rzg3s_pcie_irq_compose_msi_msg(struct irq_data *data, + struct msi_msg *msg) +{ + struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(data); + struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi); + u32 lo, hi; + + /* + * Enable and msg data enable bits are part of the address lo. Drop + * them along with the unused bit. + */ + lo = readl_relaxed(host->axi + RZG3S_PCI_MSIRCVWADRL) & + RZG3S_PCI_MSIRCVWADRL_MASK; + hi = readl_relaxed(host->axi + RZG3S_PCI_MSIRCVWADRU); + + msg->address_lo = lo; + msg->address_hi = hi; + msg->data = data->hwirq; +} + +static struct irq_chip rzg3s_pcie_msi_bottom_chip = { + .name = "rzg3s-pcie-msi", + .irq_ack = rzg3s_pcie_msi_irq_ack, + .irq_mask = rzg3s_pcie_msi_irq_mask, + .irq_unmask = rzg3s_pcie_msi_irq_unmask, + .irq_compose_msi_msg = rzg3s_pcie_irq_compose_msi_msg, +}; + +static int rzg3s_pcie_msi_domain_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *args) +{ + struct rzg3s_pcie_msi *msi = domain->host_data; + int hwirq; + + scoped_guard(mutex, &msi->map_lock) { + hwirq = bitmap_find_free_region(msi->map, RZG3S_PCI_MSI_INT_NR, + order_base_2(nr_irqs)); + } + + if (hwirq < 0) + return -ENOSPC; + + for (unsigned int i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, hwirq + i, + &rzg3s_pcie_msi_bottom_chip, + domain->host_data, handle_edge_irq, NULL, + NULL); + } + + return 0; +} + +static void rzg3s_pcie_msi_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct rzg3s_pcie_msi *msi = domain->host_data; + + guard(mutex)(&msi->map_lock); + + bitmap_release_region(msi->map, d->hwirq, order_base_2(nr_irqs)); +} + +static const struct irq_domain_ops rzg3s_pcie_msi_domain_ops = { + .alloc = rzg3s_pcie_msi_domain_alloc, + .free = rzg3s_pcie_msi_domain_free, +}; + +#define RZG3S_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_NO_AFFINITY | \ + MSI_FLAG_PCI_MSI_MASK_PARENT) + +#define RZG3S_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI | \ + MSI_GENERIC_FLAGS_MASK) + +static const struct msi_parent_ops rzg3s_pcie_msi_parent_ops = { + .required_flags = RZG3S_PCIE_MSI_FLAGS_REQUIRED, + .supported_flags = RZG3S_PCIE_MSI_FLAGS_SUPPORTED, + .bus_select_token = DOMAIN_BUS_PCI_MSI, + .chip_flags = MSI_CHIP_FLAG_SET_ACK, + .prefix = "RZG3S-", + .init_dev_msi_info = 
msi_lib_init_dev_msi_info, +}; + +static int rzg3s_pcie_msi_allocate_domains(struct rzg3s_pcie_msi *msi) +{ + struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi); + struct device *dev = host->dev; + struct irq_domain_info info = { + .fwnode = dev_fwnode(dev), + .ops = &rzg3s_pcie_msi_domain_ops, + .size = RZG3S_PCI_MSI_INT_NR, + .host_data = msi, + }; + + msi->domain = msi_create_parent_irq_domain(&info, + &rzg3s_pcie_msi_parent_ops); + if (!msi->domain) + return dev_err_probe(dev, -ENOMEM, + "failed to create IRQ domain\n"); + + return 0; +} + +static int rzg3s_pcie_msi_hw_setup(struct rzg3s_pcie_host *host) +{ + u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG; + struct rzg3s_pcie_msi *msi = &host->msi; + + /* + * Set MSI window size. HW will set the window to + * RZG3S_PCI_MSI_INT_NR * 4 bytes. + */ + writel_relaxed(FIELD_PREP(RZG3S_PCI_MSIRCVWMSKL_MASK, + RZG3S_PCI_MSI_INT_NR - 1), + host->axi + RZG3S_PCI_MSIRCVWMSKL); + + /* Set MSI window address and enable MSI window */ + writel_relaxed(upper_32_bits(msi->window_base), + host->axi + RZG3S_PCI_MSIRCVWADRU); + writel_relaxed(lower_32_bits(msi->window_base) | + RZG3S_PCI_MSIRCVWADRL_ENA | + RZG3S_PCI_MSIRCVWADRL_MSG_DATA_ENA, + host->axi + RZG3S_PCI_MSIRCVWADRL); + + /* Set MSI receive enable */ + for (u8 reg_id = 0; reg_id < regs; reg_id++) { + writel_relaxed(RZG3S_PCI_MSIRE_ENA, + host->axi + RZG3S_PCI_MSIRE(reg_id)); + } + + /* Enable message receive interrupts */ + writel_relaxed(RZG3S_PCI_MSGRCVIE_MSG_RCV, + host->axi + RZG3S_PCI_MSGRCVIE); + + /* Enable MSI */ + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE, + RZG3S_PCI_PINTRCVIE_MSI, + RZG3S_PCI_PINTRCVIE_MSI); + + return 0; +} + +static int rzg3s_pcie_msi_setup(struct rzg3s_pcie_host *host) +{ + size_t size = RZG3S_PCI_MSI_INT_NR * sizeof(u32); + struct rzg3s_pcie_msi *msi = &host->msi; + struct device *dev = host->dev; + int id, ret; + + msi->pages = __get_free_pages(GFP_KERNEL | GFP_DMA, 0); + if (!msi->pages) + return -ENOMEM; + + msi->dma_addr = dma_map_single(dev, (void *)msi->pages, size * 2, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, msi->dma_addr)) { + ret = -ENOMEM; + goto free_pages; + } + + /* + * According to the RZ/G3S HW manual (Rev.1.10, section 34.4.5.2 Setting + * the MSI Window) the MSI window needs to fall within one of the + * enabled AXI windows. Find an enabled AXI window to setup the MSI + * window. 
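A worked view of the sizes involved here, following the definitions above (32 MSI vectors, 4 bytes of message data each):

    size        = RZG3S_PCI_MSI_INT_NR * sizeof(u32);   /* 32 * 4 = 128 bytes     */
    window_base = ALIGN(msi->dma_addr, size);            /* at most dma_addr + 127 */

Mapping size * 2 = 256 bytes above is presumably what guarantees that the 128-byte-aligned window always fits inside the mapped buffer; the loop that follows checks that the buffer itself sits inside one of the enabled AXI windows before window_base is derived.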
+ */ + for (id = 0; id < RZG3S_MAX_WINDOWS; id++) { + u64 base, basel, baseu; + u64 mask, maskl, masku; + + basel = readl_relaxed(host->axi + RZG3S_PCI_AWBASEL(id)); + /* Skip checking this AXI window if it's not enabled */ + if (!(basel & RZG3S_PCI_AWBASEL_WIN_ENA)) + continue; + + baseu = readl_relaxed(host->axi + RZG3S_PCI_AWBASEU(id)); + base = baseu << 32 | basel; + + maskl = readl_relaxed(host->axi + RZG3S_PCI_AWMASKL(id)); + masku = readl_relaxed(host->axi + RZG3S_PCI_AWMASKU(id)); + mask = masku << 32 | maskl; + + if (msi->dma_addr < base || msi->dma_addr > base + mask) + continue; + + break; + } + + if (id == RZG3S_MAX_WINDOWS) { + ret = -EINVAL; + goto dma_unmap; + } + + /* The MSI base address must be aligned to the MSI size */ + msi->window_base = ALIGN(msi->dma_addr, size); + if (msi->window_base < msi->dma_addr) { + ret = -EINVAL; + goto dma_unmap; + } + + rzg3s_pcie_msi_hw_setup(host); + + return 0; + +dma_unmap: + dma_unmap_single(dev, msi->dma_addr, size * 2, DMA_BIDIRECTIONAL); +free_pages: + free_pages(msi->pages, 0); + return ret; +} + +static void rzg3s_pcie_msi_hw_teardown(struct rzg3s_pcie_host *host) +{ + u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG; + + /* Disable MSI */ + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE, + RZG3S_PCI_PINTRCVIE_MSI, 0); + + /* Disable message receive interrupts */ + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSGRCVIE, + RZG3S_PCI_MSGRCVIE_MSG_RCV, 0); + + /* Disable MSI receive enable */ + for (u8 reg_id = 0; reg_id < regs; reg_id++) + writel_relaxed(0, host->axi + RZG3S_PCI_MSIRE(reg_id)); + + /* Disable MSI window */ + writel_relaxed(0, host->axi + RZG3S_PCI_MSIRCVWADRL); +} + +static void rzg3s_pcie_teardown_msi(struct rzg3s_pcie_host *host) +{ + size_t size = RZG3S_PCI_MSI_INT_NR * sizeof(u32); + struct rzg3s_pcie_msi *msi = &host->msi; + + rzg3s_pcie_msi_hw_teardown(host); + + free_irq(msi->irq, host); + irq_domain_remove(msi->domain); + + /* Free unused memory */ + dma_unmap_single(host->dev, msi->dma_addr, size * 2, DMA_BIDIRECTIONAL); + free_pages(msi->pages, 0); +} + +static int rzg3s_pcie_init_msi(struct rzg3s_pcie_host *host) +{ + struct platform_device *pdev = to_platform_device(host->dev); + struct rzg3s_pcie_msi *msi = &host->msi; + struct device *dev = host->dev; + const char *devname; + int ret; + + ret = devm_mutex_init(dev, &msi->map_lock); + if (ret) + return ret; + + msi->irq = platform_get_irq_byname(pdev, "msi"); + if (msi->irq < 0) + return dev_err_probe(dev, msi->irq, "Failed to get MSI IRQ!\n"); + + devname = devm_kasprintf(dev, GFP_KERNEL, "%s-msi", dev_name(dev)); + if (!devname) + return -ENOMEM; + + ret = rzg3s_pcie_msi_allocate_domains(msi); + if (ret) + return ret; + + /* + * Don't use devm_request_irq() as the driver uses non-devm helpers + * to control clocks. Mixing them may lead to subtle bugs. 
+ */ + ret = request_irq(msi->irq, rzg3s_pcie_msi_irq, 0, devname, host); + if (ret) { + dev_err_probe(dev, ret, "Failed to request IRQ: %d\n", ret); + goto free_domains; + } + + ret = rzg3s_pcie_msi_setup(host); + if (ret) { + dev_err_probe(dev, ret, "Failed to setup MSI!\n"); + goto free_irq; + } + + return 0; + +free_irq: + free_irq(msi->irq, host); +free_domains: + irq_domain_remove(msi->domain); + return ret; +} + +static void rzg3s_pcie_intx_irq_ack(struct irq_data *d) +{ + struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d); + + guard(raw_spinlock_irqsave)(&host->hw_lock); + + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIS, + RZG3S_PCI_PINTRCVIS_INTX(d->hwirq), + RZG3S_PCI_PINTRCVIS_INTX(d->hwirq)); +} + +static void rzg3s_pcie_intx_irq_mask(struct irq_data *d) +{ + struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d); + + guard(raw_spinlock_irqsave)(&host->hw_lock); + + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE, + RZG3S_PCI_PINTRCVIE_INTX(d->hwirq), 0); +} + +static void rzg3s_pcie_intx_irq_unmask(struct irq_data *d) +{ + struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d); + + guard(raw_spinlock_irqsave)(&host->hw_lock); + + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE, + RZG3S_PCI_PINTRCVIE_INTX(d->hwirq), + RZG3S_PCI_PINTRCVIE_INTX(d->hwirq)); +} + +static struct irq_chip rzg3s_pcie_intx_irq_chip = { + .name = "PCIe INTx", + .irq_ack = rzg3s_pcie_intx_irq_ack, + .irq_mask = rzg3s_pcie_intx_irq_mask, + .irq_unmask = rzg3s_pcie_intx_irq_unmask, +}; + +static int rzg3s_pcie_intx_map(struct irq_domain *domain, unsigned int irq, + irq_hw_number_t hwirq) +{ + irq_set_chip_and_handler(irq, &rzg3s_pcie_intx_irq_chip, + handle_level_irq); + irq_set_chip_data(irq, domain->host_data); + + return 0; +} + +static const struct irq_domain_ops rzg3s_pcie_intx_domain_ops = { + .map = rzg3s_pcie_intx_map, + .xlate = irq_domain_xlate_onetwocell, +}; + +static int rzg3s_pcie_init_irqdomain(struct rzg3s_pcie_host *host) +{ + struct device *dev = host->dev; + struct platform_device *pdev = to_platform_device(dev); + + for (int i = 0; i < PCI_NUM_INTX; i++) { + char irq_name[5] = {0}; + int irq; + + scnprintf(irq_name, ARRAY_SIZE(irq_name), "int%c", 'a' + i); + + irq = platform_get_irq_byname(pdev, irq_name); + if (irq < 0) + return dev_err_probe(dev, -EINVAL, + "Failed to parse and map INT%c IRQ\n", + 'A' + i); + + host->intx_irqs[i] = irq; + irq_set_chained_handler_and_data(irq, + rzg3s_pcie_intx_irq_handler, + host); + } + + host->intx_domain = irq_domain_create_linear(dev_fwnode(dev), + PCI_NUM_INTX, + &rzg3s_pcie_intx_domain_ops, + host); + if (!host->intx_domain) + return dev_err_probe(dev, -EINVAL, + "Failed to add irq domain for INTx IRQs\n"); + irq_domain_update_bus_token(host->intx_domain, DOMAIN_BUS_WIRED); + + if (IS_ENABLED(CONFIG_PCI_MSI)) { + int ret = rzg3s_pcie_init_msi(host); + + if (ret) { + irq_domain_remove(host->intx_domain); + return ret; + } + } + + return 0; +} + +static void rzg3s_pcie_teardown_irqdomain(struct rzg3s_pcie_host *host) +{ + if (IS_ENABLED(CONFIG_PCI_MSI)) + rzg3s_pcie_teardown_msi(host); + + irq_domain_remove(host->intx_domain); +} + +static int rzg3s_pcie_set_max_link_speed(struct rzg3s_pcie_host *host) +{ + u32 remote_supported_link_speeds, max_supported_link_speeds; + u32 cs2, tmp, pcie_cap = RZG3S_PCI_CFG_PCIEC; + u32 cur_link_speed, link_speed; + u8 ltssm_state_l0 = 0xc; + int ret; + u16 ls; + + /* + * According to the RZ/G3S HW manual (Rev.1.10, section 34.6.3 Caution + * when Changing the Speed 
Spontaneously) link speed change can be done + * only when the LTSSM is in L0. + */ + ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT1, tmp, + FIELD_GET(RZG3S_PCI_PCSTAT1_LTSSM_STATE, tmp) == ltssm_state_l0, + PCIE_LINK_WAIT_SLEEP_MS * MILLI, + PCIE_LINK_WAIT_SLEEP_MS * MILLI * + PCIE_LINK_WAIT_MAX_RETRIES); + if (ret) + return ret; + + ls = readw_relaxed(host->pcie + pcie_cap + PCI_EXP_LNKSTA); + cs2 = readl_relaxed(host->axi + RZG3S_PCI_PCSTAT2); + + switch (pcie_link_speed[host->max_link_speed]) { + case PCIE_SPEED_5_0GT: + max_supported_link_speeds = GENMASK(PCI_EXP_LNKSTA_CLS_5_0GB - 1, 0); + link_speed = PCI_EXP_LNKCTL2_TLS_5_0GT; + break; + default: + /* Should not happen */ + return -EINVAL; + } + + cur_link_speed = FIELD_GET(PCI_EXP_LNKSTA_CLS, ls); + remote_supported_link_speeds = FIELD_GET(RZG3S_PCI_PCSTAT2_SDRIRE, cs2); + /* Drop reserved bits */ + remote_supported_link_speeds &= max_supported_link_speeds; + + /* + * Return if max link speed is already set or the connected device + * doesn't support it. + */ + if (cur_link_speed == host->max_link_speed || + remote_supported_link_speeds != max_supported_link_speeds) + return 0; + + /* Set target Link speed */ + rzg3s_pcie_update_bits(host->pcie, pcie_cap + PCI_EXP_LNKCTL2, + PCI_EXP_LNKCTL2_TLS, + FIELD_PREP(PCI_EXP_LNKCTL2_TLS, link_speed)); + + /* Request link speed change */ + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PCCTRL2, + RZG3S_PCI_PCCTRL2_LS_CHG_REQ | + RZG3S_PCI_PCCTRL2_LS_CHG, + RZG3S_PCI_PCCTRL2_LS_CHG_REQ | + FIELD_PREP(RZG3S_PCI_PCCTRL2_LS_CHG, + link_speed - 1)); + + ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT2, cs2, + (cs2 & RZG3S_PCI_PCSTAT2_LS_CHG_DONE), + PCIE_LINK_WAIT_SLEEP_MS * MILLI, + PCIE_LINK_WAIT_SLEEP_MS * MILLI * + PCIE_LINK_WAIT_MAX_RETRIES); + + /* + * According to the RZ/G3S HW manual (Rev.1.10, section 34.6.3 Caution + * when Changing the Speed Spontaneously) the PCI_PCCTRL2_LS_CHG_REQ + * should be de-asserted after checking for PCI_PCSTAT2_LS_CHG_DONE. + */ + rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PCCTRL2, + RZG3S_PCI_PCCTRL2_LS_CHG_REQ, 0); + + return ret; +} + +static int rzg3s_pcie_config_init(struct rzg3s_pcie_host *host) +{ + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host); + struct resource_entry *ft; + struct resource *bus; + u8 subordinate_bus; + u8 secondary_bus; + u8 primary_bus; + + ft = resource_list_first_type(&bridge->windows, IORESOURCE_BUS); + if (!ft) + return -ENODEV; + + bus = ft->res; + primary_bus = bus->start; + secondary_bus = bus->start + 1; + subordinate_bus = bus->end; + + /* Enable access control to the CFGU */ + writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN, + host->axi + RZG3S_PCI_PERM); + + /* HW manual recommends to write 0xffffffff on initialization */ + writel_relaxed(0xffffffff, host->pcie + RZG3S_PCI_CFG_BARMSK00L); + writel_relaxed(0xffffffff, host->pcie + RZG3S_PCI_CFG_BARMSK00U); + + /* Update bus info */ + writeb_relaxed(primary_bus, host->pcie + PCI_PRIMARY_BUS); + writeb_relaxed(secondary_bus, host->pcie + PCI_SECONDARY_BUS); + writeb_relaxed(subordinate_bus, host->pcie + PCI_SUBORDINATE_BUS); + + /* Disable access control to the CFGU */ + writel_relaxed(0, host->axi + RZG3S_PCI_PERM); + + return 0; +} + +static void rzg3s_pcie_irq_init(struct rzg3s_pcie_host *host) +{ + /* + * According to the HW manual of the RZ/G3S (Rev.1.10, sections + * corresponding to all registers written with ~0U), the hardware + * ignores value written to unused bits. Writing ~0U to these registers + * should be safe. 
+ */ + + /* Clear the link state and PM transitions */ + writel_relaxed(RZG3S_PCI_PEIS0_DL_UPDOWN | + RZG3S_PCI_PEIS0_RX_DLLP_PM_ENTER, + host->axi + RZG3S_PCI_PEIS0); + + /* Disable all interrupts */ + writel_relaxed(0, host->axi + RZG3S_PCI_PEIE0); + + /* Clear all parity and ecc error interrupts */ + writel_relaxed(~0U, host->axi + RZG3S_PCI_PEIS1); + + /* Disable all parity and ecc error interrupts */ + writel_relaxed(0, host->axi + RZG3S_PCI_PEIE1); + + /* Clear all AXI master error interrupts */ + writel_relaxed(~0U, host->axi + RZG3S_PCI_AMEIS); + + /* Clear all AXI slave error interrupts */ + writel_relaxed(~0U, host->axi + RZG3S_PCI_ASEIS1); + + /* Clear all message receive interrupts */ + writel_relaxed(~0U, host->axi + RZG3S_PCI_MSGRCVIS); +} + +static int rzg3s_pcie_power_resets_deassert(struct rzg3s_pcie_host *host) +{ + const struct rzg3s_pcie_soc_data *data = host->data; + + /* + * According to the RZ/G3S HW manual (Rev.1.10, section + * 34.5.1.2 De-asserting the Reset) the PCIe IP needs to wait 5ms from + * power on to the de-assertion of reset. + */ + fsleep(5000); + return reset_control_bulk_deassert(data->num_power_resets, + host->power_resets); +} + +static int rzg3s_pcie_resets_prepare_and_get(struct rzg3s_pcie_host *host) +{ + const struct rzg3s_pcie_soc_data *data = host->data; + unsigned int i; + int ret; + + host->power_resets = devm_kmalloc_array(host->dev, + data->num_power_resets, + sizeof(*host->power_resets), + GFP_KERNEL); + if (!host->power_resets) + return -ENOMEM; + + for (i = 0; i < data->num_power_resets; i++) + host->power_resets[i].id = data->power_resets[i]; + + host->cfg_resets = devm_kmalloc_array(host->dev, + data->num_cfg_resets, + sizeof(*host->cfg_resets), + GFP_KERNEL); + if (!host->cfg_resets) + return -ENOMEM; + + for (i = 0; i < data->num_cfg_resets; i++) + host->cfg_resets[i].id = data->cfg_resets[i]; + + ret = devm_reset_control_bulk_get_exclusive(host->dev, + data->num_power_resets, + host->power_resets); + if (ret) + return ret; + + return devm_reset_control_bulk_get_exclusive(host->dev, + data->num_cfg_resets, + host->cfg_resets); +} + +static int rzg3s_pcie_host_parse_port(struct rzg3s_pcie_host *host) +{ + struct device_node *of_port = of_get_next_child(host->dev->of_node, NULL); + struct rzg3s_pcie_port *port = &host->port; + int ret; + + ret = of_property_read_u32(of_port, "vendor-id", &port->vendor_id); + if (ret) + return ret; + + ret = of_property_read_u32(of_port, "device-id", &port->device_id); + if (ret) + return ret; + + port->refclk = of_clk_get_by_name(of_port, "ref"); + if (IS_ERR(port->refclk)) + return PTR_ERR(port->refclk); + + return 0; +} + +static int rzg3s_pcie_host_init_port(struct rzg3s_pcie_host *host) +{ + struct rzg3s_pcie_port *port = &host->port; + struct device *dev = host->dev; + int ret; + + /* Enable access control to the CFGU */ + writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN, + host->axi + RZG3S_PCI_PERM); + + /* Update vendor ID and device ID */ + writew_relaxed(port->vendor_id, host->pcie + PCI_VENDOR_ID); + writew_relaxed(port->device_id, host->pcie + PCI_DEVICE_ID); + + /* Disable access control to the CFGU */ + writel_relaxed(0, host->axi + RZG3S_PCI_PERM); + + ret = clk_prepare_enable(port->refclk); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable refclk!\n"); + + /* Set the PHY, if any */ + if (host->data->init_phy) { + ret = host->data->init_phy(host); + if (ret) { + dev_err_probe(dev, ret, "Failed to set the PHY!\n"); + goto refclk_disable; + } + } + + return 0; + +refclk_disable: + 
clk_disable_unprepare(port->refclk); + return ret; +} + +static int rzg3s_pcie_host_init(struct rzg3s_pcie_host *host) +{ + u32 val; + int ret; + + /* Initialize the PCIe related registers */ + ret = rzg3s_pcie_config_init(host); + if (ret) + return ret; + + ret = rzg3s_pcie_host_init_port(host); + if (ret) + return ret; + + /* Initialize the interrupts */ + rzg3s_pcie_irq_init(host); + + ret = reset_control_bulk_deassert(host->data->num_cfg_resets, + host->cfg_resets); + if (ret) + goto disable_port_refclk; + + /* Wait for link up */ + ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT1, val, + !(val & RZG3S_PCI_PCSTAT1_DL_DOWN_STS), + PCIE_LINK_WAIT_SLEEP_MS * MILLI, + PCIE_LINK_WAIT_SLEEP_MS * MILLI * + PCIE_LINK_WAIT_MAX_RETRIES); + if (ret) + goto cfg_resets_deassert; + + val = readl_relaxed(host->axi + RZG3S_PCI_PCSTAT2); + dev_info(host->dev, "PCIe link status [0x%x]\n", val); + + return 0; + +cfg_resets_deassert: + reset_control_bulk_assert(host->data->num_cfg_resets, + host->cfg_resets); +disable_port_refclk: + clk_disable_unprepare(host->port.refclk); + return ret; +} + +static void rzg3s_pcie_set_inbound_window(struct rzg3s_pcie_host *host, + u64 cpu_addr, u64 pci_addr, u64 size, + int id) +{ + /* Set CPU window base address */ + writel_relaxed(upper_32_bits(cpu_addr), + host->axi + RZG3S_PCI_ADESTU(id)); + writel_relaxed(lower_32_bits(cpu_addr), + host->axi + RZG3S_PCI_ADESTL(id)); + + /* Set window size */ + writel_relaxed(upper_32_bits(size), host->axi + RZG3S_PCI_AWMASKU(id)); + writel_relaxed(lower_32_bits(size), host->axi + RZG3S_PCI_AWMASKL(id)); + + /* Set PCIe window base address and enable the window */ + writel_relaxed(upper_32_bits(pci_addr), + host->axi + RZG3S_PCI_AWBASEU(id)); + writel_relaxed(lower_32_bits(pci_addr) | RZG3S_PCI_AWBASEL_WIN_ENA, + host->axi + RZG3S_PCI_AWBASEL(id)); +} + +static int rzg3s_pcie_set_inbound_windows(struct rzg3s_pcie_host *host, + struct resource_entry *entry, + int *index) +{ + u64 pci_addr = entry->res->start - entry->offset; + u64 cpu_addr = entry->res->start; + u64 cpu_end = entry->res->end; + u64 size_id = 0; + int id = *index; + u64 size; + + while (cpu_addr < cpu_end) { + if (id >= RZG3S_MAX_WINDOWS) + return dev_err_probe(host->dev, -ENOSPC, + "Failed to map inbound window for resource (%s)\n", + entry->res->name); + + size = resource_size(entry->res) - size_id; + + /* + * According to the RZ/G3S HW manual (Rev.1.10, + * section 34.3.1.71 AXI Window Mask (Lower) Registers) the min + * size is 4K. + */ + size = max(size, SZ_4K); + + /* + * According the RZ/G3S HW manual (Rev.1.10, sections: + * - 34.3.1.69 AXI Window Base (Lower) Registers + * - 34.3.1.71 AXI Window Mask (Lower) Registers + * - 34.3.1.73 AXI Destination (Lower) Registers) + * the CPU addr, PCIe addr, size should be 4K aligned and be a + * power of 2. + */ + size = ALIGN(size, SZ_4K); + size = roundup_pow_of_two(size); + + cpu_addr = ALIGN(cpu_addr, SZ_4K); + pci_addr = ALIGN(pci_addr, SZ_4K); + + /* + * According to the RZ/G3S HW manual (Rev.1.10, section + * 34.3.1.71 AXI Window Mask (Lower) Registers) HW expects first + * 12 LSB bits to be 0xfff. Subtract 1 from size for this. 
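As a concrete (hypothetical) case of the sizing rules above: a 6 MB dma-range with an identity mapping (offset 0) starting at 0x48000000 ends up programmed as:

    size = roundup_pow_of_two(ALIGN(6 * SZ_1M, SZ_4K));       /* 8 MB */
    rzg3s_pcie_set_inbound_window(host, 0x48000000, 0x48000000,
                                  size - 1, id);               /* AWMASKL = 0x007fffff,   */
                                                               /* low 12 bits are 0xfff   */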
+ */ + rzg3s_pcie_set_inbound_window(host, cpu_addr, pci_addr, + size - 1, id); + + pci_addr += size; + cpu_addr += size; + size_id = size; + id++; + } + *index = id; + + return 0; +} + +static int rzg3s_pcie_parse_map_dma_ranges(struct rzg3s_pcie_host *host) +{ + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host); + struct resource_entry *entry; + int i = 0, ret; + + resource_list_for_each_entry(entry, &bridge->dma_ranges) { + ret = rzg3s_pcie_set_inbound_windows(host, entry, &i); + if (ret) + return ret; + } + + return 0; +} + +static void rzg3s_pcie_set_outbound_window(struct rzg3s_pcie_host *host, + struct resource_entry *win, int id) +{ + struct resource *res = win->res; + resource_size_t size = resource_size(res); + resource_size_t res_start; + + if (res->flags & IORESOURCE_IO) + res_start = pci_pio_to_address(res->start) - win->offset; + else + res_start = res->start - win->offset; + + /* + * According to the RZ/G3S HW manual (Rev.1.10, section 34.3.1.75 PCIe + * Window Base (Lower) Registers) the window base address need to be 4K + * aligned. + */ + res_start = ALIGN(res_start, SZ_4K); + + size = ALIGN(size, SZ_4K); + size = roundup_pow_of_two(size) - 1; + + /* Set PCIe destination */ + writel_relaxed(upper_32_bits(res_start), + host->axi + RZG3S_PCI_PDESTU(id)); + writel_relaxed(lower_32_bits(res_start), + host->axi + RZG3S_PCI_PDESTL(id)); + + /* Set PCIe window mask */ + writel_relaxed(upper_32_bits(size), host->axi + RZG3S_PCI_PWMASKU(id)); + writel_relaxed(lower_32_bits(size), host->axi + RZG3S_PCI_PWMASKL(id)); + + /* Set PCIe window base and enable the window */ + writel_relaxed(upper_32_bits(res_start), + host->axi + RZG3S_PCI_PWBASEU(id)); + writel_relaxed(lower_32_bits(res_start) | RZG3S_PCI_PWBASEL_ENA, + host->axi + RZG3S_PCI_PWBASEL(id)); +} + +static int rzg3s_pcie_parse_map_ranges(struct rzg3s_pcie_host *host) +{ + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host); + struct resource_entry *win; + int i = 0; + + resource_list_for_each_entry(win, &bridge->windows) { + struct resource *res = win->res; + + if (i >= RZG3S_MAX_WINDOWS) + return dev_err_probe(host->dev, -ENOSPC, + "Failed to map outbound window for resource (%s)\n", + res->name); + + if (!res->flags) + continue; + + switch (resource_type(res)) { + case IORESOURCE_IO: + case IORESOURCE_MEM: + rzg3s_pcie_set_outbound_window(host, win, i); + i++; + break; + } + } + + return 0; +} + +static int rzg3s_soc_pcie_init_phy(struct rzg3s_pcie_host *host) +{ + static const u32 xcfgd_settings[RZG3S_PCI_PHY_XCFGD_NUM] = { + [8] = 0xe0006801, 0x007f7e30, 0x183e0000, 0x978ff500, + 0xec000000, 0x009f1400, 0x0000d009, + [17] = 0x78000000, + [19] = 0x00880000, 0x000005c0, 0x07000000, 0x00780920, + 0xc9400ce2, 0x90000c0c, 0x000c1414, 0x00005034, + 0x00006000, 0x00000001, + }; + static const u32 xcfga_cmn_settings[RZG3S_PCI_PHY_XCFGA_CMN_NUM] = { + 0x00000d10, 0x08310100, 0x00c21404, 0x013c0010, 0x01874440, + 0x1a216082, 0x00103440, 0x00000080, 0x00000010, 0x0c1000c1, + 0x1000c100, 0x0222000c, 0x00640019, 0x00a00028, 0x01d11228, + 0x0201001d, + }; + static const u32 xcfga_rx_settings[RZG3S_PCI_PHY_XCFGA_RX_NUM] = { + 0x07d55000, 0x030e3f00, 0x00000288, 0x102c5880, 0x0000000b, + 0x04141441, 0x00641641, 0x00d63d63, 0x00641641, 0x01970377, + 0x00190287, 0x00190028, 0x00000028, + }; + unsigned int i; + + /* + * Enable access permission for physical layer control and status + * registers. 
+ */ + writel_relaxed(RZG3S_PCI_PERM_PIPE_PHY_REG_EN, + host->axi + RZG3S_PCI_PERM); + + for (i = 0; i < RZG3S_PCI_PHY_XCFGD_NUM; i++) { + writel_relaxed(xcfgd_settings[i], + host->axi + RZG3S_PCI_PHY_XCFGD(i)); + } + + for (i = 0; i < RZG3S_PCI_PHY_XCFGA_CMN_NUM; i++) { + writel_relaxed(xcfga_cmn_settings[i], + host->axi + RZG3S_PCI_PHY_XCFGA_CMN(i)); + } + + for (i = 0; i < RZG3S_PCI_PHY_XCFGA_RX_NUM; i++) { + writel_relaxed(xcfga_rx_settings[i], + host->axi + RZG3S_PCI_PHY_XCFGA_RX(i)); + } + + writel_relaxed(0x107, host->axi + RZG3S_PCI_PHY_XCFGA_TX); + + /* Select PHY settings values */ + writel_relaxed(RZG3S_PCI_PHY_XCFG_CTRL_PHYREG_SEL, + host->axi + RZG3S_PCI_PHY_XCFG_CTRL); + + /* + * Disable access permission for physical layer control and status + * registers. + */ + writel_relaxed(0, host->axi + RZG3S_PCI_PERM); + + return 0; +} + +static int +rzg3s_pcie_host_setup(struct rzg3s_pcie_host *host, + int (*init_irqdomain)(struct rzg3s_pcie_host *host), + void (*teardown_irqdomain)(struct rzg3s_pcie_host *host)) +{ + struct device *dev = host->dev; + int ret; + + /* Set inbound windows */ + ret = rzg3s_pcie_parse_map_dma_ranges(host); + if (ret) + return dev_err_probe(dev, ret, + "Failed to set inbound windows!\n"); + + /* Set outbound windows */ + ret = rzg3s_pcie_parse_map_ranges(host); + if (ret) + return dev_err_probe(dev, ret, + "Failed to set outbound windows!\n"); + + ret = init_irqdomain(host); + if (ret) + return dev_err_probe(dev, ret, "Failed to init IRQ domain\n"); + + ret = rzg3s_pcie_host_init(host); + if (ret) { + dev_err_probe(dev, ret, "Failed to initialize the HW!\n"); + goto teardown_irqdomain; + } + + ret = rzg3s_pcie_set_max_link_speed(host); + if (ret) + dev_info(dev, "Failed to set max link speed\n"); + + msleep(PCIE_RESET_CONFIG_WAIT_MS); + + return 0; + +teardown_irqdomain: + teardown_irqdomain(host); + + return ret; +} + +static int rzg3s_pcie_probe(struct platform_device *pdev) +{ + struct pci_host_bridge *bridge; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct device_node *sysc_np __free(device_node) = + of_parse_phandle(np, "renesas,sysc", 0); + struct rzg3s_pcie_host *host; + int ret; + + bridge = devm_pci_alloc_host_bridge(dev, sizeof(*host)); + if (!bridge) + return -ENOMEM; + + host = pci_host_bridge_priv(bridge); + host->dev = dev; + host->data = device_get_match_data(dev); + platform_set_drvdata(pdev, host); + + host->axi = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(host->axi)) + return PTR_ERR(host->axi); + host->pcie = host->axi + RZG3S_PCI_CFG_BASE; + + host->max_link_speed = of_pci_get_max_link_speed(np); + if (host->max_link_speed < 0) + host->max_link_speed = 2; + + ret = rzg3s_pcie_host_parse_port(host); + if (ret) + return ret; + + host->sysc = syscon_node_to_regmap(sysc_np); + if (IS_ERR(host->sysc)) { + ret = PTR_ERR(host->sysc); + goto port_refclk_put; + } + + ret = regmap_update_bits(host->sysc, RZG3S_SYS_PCIE_RST_RSM_B, + RZG3S_SYS_PCIE_RST_RSM_B_MASK, + FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 1)); + if (ret) + goto port_refclk_put; + + ret = rzg3s_pcie_resets_prepare_and_get(host); + if (ret) + goto sysc_signal_restore; + + ret = rzg3s_pcie_power_resets_deassert(host); + if (ret) + goto sysc_signal_restore; + + pm_runtime_enable(dev); + + /* + * Controller clocks are part of a clock power domain. Enable them + * through runtime PM. 
+ */ + ret = pm_runtime_resume_and_get(dev); + if (ret) + goto rpm_disable; + + raw_spin_lock_init(&host->hw_lock); + + ret = rzg3s_pcie_host_setup(host, rzg3s_pcie_init_irqdomain, + rzg3s_pcie_teardown_irqdomain); + if (ret) + goto rpm_put; + + bridge->sysdata = host; + bridge->ops = &rzg3s_pcie_root_ops; + bridge->child_ops = &rzg3s_pcie_child_ops; + ret = pci_host_probe(bridge); + if (ret) + goto host_probe_teardown; + + return 0; + +host_probe_teardown: + rzg3s_pcie_teardown_irqdomain(host); + reset_control_bulk_deassert(host->data->num_cfg_resets, + host->cfg_resets); +rpm_put: + pm_runtime_put_sync(dev); +rpm_disable: + pm_runtime_disable(dev); + reset_control_bulk_assert(host->data->num_power_resets, + host->power_resets); +sysc_signal_restore: + /* + * SYSC RST_RSM_B signal need to be asserted before turning off the + * power to the PHY. + */ + regmap_update_bits(host->sysc, RZG3S_SYS_PCIE_RST_RSM_B, + RZG3S_SYS_PCIE_RST_RSM_B_MASK, + FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0)); +port_refclk_put: + clk_put(host->port.refclk); + + return ret; +} + +static int rzg3s_pcie_suspend_noirq(struct device *dev) +{ + struct rzg3s_pcie_host *host = dev_get_drvdata(dev); + const struct rzg3s_pcie_soc_data *data = host->data; + struct rzg3s_pcie_port *port = &host->port; + struct regmap *sysc = host->sysc; + int ret; + + ret = pm_runtime_put_sync(dev); + if (ret) + return ret; + + clk_disable_unprepare(port->refclk); + + ret = reset_control_bulk_assert(data->num_power_resets, + host->power_resets); + if (ret) + goto refclk_restore; + + ret = reset_control_bulk_assert(data->num_cfg_resets, + host->cfg_resets); + if (ret) + goto power_resets_restore; + + ret = regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B, + RZG3S_SYS_PCIE_RST_RSM_B_MASK, + FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0)); + if (ret) + goto cfg_resets_restore; + + return 0; + + /* Restore the previous state if any error happens */ +cfg_resets_restore: + reset_control_bulk_deassert(data->num_cfg_resets, + host->cfg_resets); +power_resets_restore: + reset_control_bulk_deassert(data->num_power_resets, + host->power_resets); +refclk_restore: + clk_prepare_enable(port->refclk); + pm_runtime_resume_and_get(dev); + return ret; +} + +static int rzg3s_pcie_resume_noirq(struct device *dev) +{ + struct rzg3s_pcie_host *host = dev_get_drvdata(dev); + const struct rzg3s_pcie_soc_data *data = host->data; + struct regmap *sysc = host->sysc; + int ret; + + ret = regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B, + RZG3S_SYS_PCIE_RST_RSM_B_MASK, + FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 1)); + if (ret) + return ret; + + ret = rzg3s_pcie_power_resets_deassert(host); + if (ret) + goto assert_rst_rsm_b; + + ret = pm_runtime_resume_and_get(dev); + if (ret) + goto assert_power_resets; + + ret = rzg3s_pcie_host_setup(host, rzg3s_pcie_msi_hw_setup, + rzg3s_pcie_msi_hw_teardown); + if (ret) + goto rpm_put; + + return 0; + + /* + * If any error happens there is no way to recover the IP. Put it in the + * lowest possible power state. 
+ */ +rpm_put: + pm_runtime_put_sync(dev); +assert_power_resets: + reset_control_bulk_assert(data->num_power_resets, + host->power_resets); +assert_rst_rsm_b: + regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B, + RZG3S_SYS_PCIE_RST_RSM_B_MASK, + FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0)); + return ret; +} + +static const struct dev_pm_ops rzg3s_pcie_pm_ops = { + NOIRQ_SYSTEM_SLEEP_PM_OPS(rzg3s_pcie_suspend_noirq, + rzg3s_pcie_resume_noirq) +}; + +static const char * const rzg3s_soc_power_resets[] = { + "aresetn", "rst_cfg_b", "rst_load_b", +}; + +static const char * const rzg3s_soc_cfg_resets[] = { + "rst_b", "rst_ps_b", "rst_gp_b", "rst_rsm_b", +}; + +static const struct rzg3s_pcie_soc_data rzg3s_soc_data = { + .power_resets = rzg3s_soc_power_resets, + .num_power_resets = ARRAY_SIZE(rzg3s_soc_power_resets), + .cfg_resets = rzg3s_soc_cfg_resets, + .num_cfg_resets = ARRAY_SIZE(rzg3s_soc_cfg_resets), + .init_phy = rzg3s_soc_pcie_init_phy, +}; + +static const struct of_device_id rzg3s_pcie_of_match[] = { + { + .compatible = "renesas,r9a08g045-pcie", + .data = &rzg3s_soc_data, + }, + {} +}; + +static struct platform_driver rzg3s_pcie_driver = { + .driver = { + .name = "rzg3s-pcie-host", + .of_match_table = rzg3s_pcie_of_match, + .pm = pm_ptr(&rzg3s_pcie_pm_ops), + .suppress_bind_attrs = true, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, + .probe = rzg3s_pcie_probe, +}; +builtin_platform_driver(rzg3s_pcie_driver); diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index b4b62b9ccc45..ec6afc38e898 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -578,22 +578,6 @@ static void vmd_detach_resources(struct vmd_dev *vmd) vmd->dev->resource[VMD_MEMBAR2].child = NULL; } -/* - * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. - * Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of which the lower - * 16 bits are the PCI Segment Group (domain) number. Other bits are - * currently reserved. - */ -static int vmd_find_free_domain(void) -{ - int domain = 0xffff; - struct pci_bus *bus = NULL; - - while ((bus = pci_find_next_bus(bus)) != NULL) - domain = max_t(int, domain, pci_domain_nr(bus)); - return domain + 1; -} - static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint, resource_size_t *offset1, resource_size_t *offset2) @@ -878,13 +862,6 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) .parent = res, }; - sd->vmd_dev = vmd->dev; - sd->domain = vmd_find_free_domain(); - if (sd->domain < 0) - return sd->domain; - - sd->node = pcibus_to_node(vmd->dev->bus); - /* * Currently MSI remapping must be enabled in guest passthrough mode * due to some missing interrupt remapping plumbing. This is probably @@ -910,9 +887,24 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]); pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]); + sd->vmd_dev = vmd->dev; + + /* + * Emulated domains start at 0x10000 to not clash with ACPI _SEG + * domains. Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of + * which the lower 16 bits are the PCI Segment Group (domain) number. + * Other bits are currently reserved. 
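For reference, the lifetime of the emulated domain number across this patch, collected into one sketch (the two pci_bus_*_emul_domain_nr() helpers are introduced outside this hunk, so their exact semantics are assumed from the call sites):

    vmd->sysdata.domain = PCI_DOMAIN_NR_NOT_SET;               /* probe: nothing allocated yet */
    sd->domain = pci_bus_find_emul_domain_nr(0, 0x10000, INT_MAX);
                                                               /* enable: presumably a free    */
                                                               /* domain in [0x10000, INT_MAX) */
    pci_bus_release_emul_domain_nr(sd->domain);                /* failed root-bus creation,    */
                                                               /* or vmd_remove()              */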
+ */ + sd->domain = pci_bus_find_emul_domain_nr(0, 0x10000, INT_MAX); + if (sd->domain < 0) + return sd->domain; + + sd->node = pcibus_to_node(vmd->dev->bus); + vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start, &vmd_ops, sd, &resources); if (!vmd->bus) { + pci_bus_release_emul_domain_nr(sd->domain); pci_free_resource_list(&resources); vmd_remove_irq_domain(vmd); return -ENODEV; @@ -1005,6 +997,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) return -ENOMEM; vmd->dev = dev; + vmd->sysdata.domain = PCI_DOMAIN_NR_NOT_SET; vmd->instance = ida_alloc(&vmd_instance_ida, GFP_KERNEL); if (vmd->instance < 0) return vmd->instance; @@ -1070,6 +1063,7 @@ static void vmd_remove(struct pci_dev *dev) vmd_detach_resources(vmd); vmd_remove_irq_domain(vmd); ida_free(&vmd_instance_ida, vmd->instance); + pci_bus_release_emul_domain_nr(vmd->sysdata.domain); } static void vmd_shutdown(struct pci_dev *dev) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index 0686f7ecbe91..debd235253c5 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -729,8 +729,9 @@ static void pci_epf_test_enable_doorbell(struct pci_epf_test *epf_test, if (bar < BAR_0) goto err_doorbell_cleanup; - ret = request_irq(epf->db_msg[0].virq, pci_epf_test_doorbell_handler, 0, - "pci-ep-test-doorbell", epf_test); + ret = request_threaded_irq(epf->db_msg[0].virq, NULL, + pci_epf_test_doorbell_handler, IRQF_ONESHOT, + "pci-ep-test-doorbell", epf_test); if (ret) { dev_err(&epf->dev, "Failed to request doorbell IRQ: %d\n", diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index 83e9ab10f9c4..3ecc5059f92b 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -36,11 +36,13 @@ * PCIe Root Port PCI EP */ +#include <linux/atomic.h> #include <linux/delay.h> #include <linux/io.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/pci-ep-msi.h> #include <linux/pci-epc.h> #include <linux/pci-epf.h> #include <linux/ntb.h> @@ -126,12 +128,13 @@ struct epf_ntb { u32 db_count; u32 spad_count; u64 mws_size[MAX_MW]; - u64 db; + atomic64_t db; u32 vbus_number; u16 vntb_pid; u16 vntb_vid; bool linkup; + bool msi_doorbell; u32 spad_size; enum pci_barno epf_ntb_bar[VNTB_BAR_NUM]; @@ -258,9 +261,9 @@ static void epf_ntb_cmd_handler(struct work_struct *work) ntb = container_of(work, struct epf_ntb, cmd_handler.work); - for (i = 1; i < ntb->db_count; i++) { + for (i = 1; i < ntb->db_count && !ntb->msi_doorbell; i++) { if (ntb->epf_db[i]) { - ntb->db |= 1 << (i - 1); + atomic64_or(1 << (i - 1), &ntb->db); ntb_db_event(&ntb->ntb, i); ntb->epf_db[i] = 0; } @@ -319,7 +322,21 @@ static void epf_ntb_cmd_handler(struct work_struct *work) reset_handler: queue_delayed_work(kpcintb_workqueue, &ntb->cmd_handler, - msecs_to_jiffies(5)); + ntb->msi_doorbell ? 
msecs_to_jiffies(500) : msecs_to_jiffies(5)); +} + +static irqreturn_t epf_ntb_doorbell_handler(int irq, void *data) +{ + struct epf_ntb *ntb = data; + int i; + + for (i = 1; i < ntb->db_count; i++) + if (irq == ntb->epf->db_msg[i].virq) { + atomic64_or(1 << (i - 1), &ntb->db); + ntb_db_event(&ntb->ntb, i); + } + + return IRQ_HANDLED; } /** @@ -500,6 +517,94 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb) return 0; } +static int epf_ntb_db_bar_init_msi_doorbell(struct epf_ntb *ntb, + struct pci_epf_bar *db_bar, + const struct pci_epc_features *epc_features, + enum pci_barno barno) +{ + struct pci_epf *epf = ntb->epf; + dma_addr_t low, high; + struct msi_msg *msg; + size_t sz; + int ret; + int i; + + ret = pci_epf_alloc_doorbell(epf, ntb->db_count); + if (ret) + return ret; + + for (i = 0; i < ntb->db_count; i++) { + ret = request_irq(epf->db_msg[i].virq, epf_ntb_doorbell_handler, + 0, "pci_epf_vntb_db", ntb); + + if (ret) { + dev_err(&epf->dev, + "Failed to request doorbell IRQ: %d\n", + epf->db_msg[i].virq); + goto err_free_irq; + } + } + + msg = &epf->db_msg[0].msg; + + high = 0; + low = (u64)msg->address_hi << 32 | msg->address_lo; + + for (i = 0; i < ntb->db_count; i++) { + struct msi_msg *msg = &epf->db_msg[i].msg; + dma_addr_t addr = (u64)msg->address_hi << 32 | msg->address_lo; + + low = min(low, addr); + high = max(high, addr); + } + + sz = high - low + sizeof(u32); + + ret = pci_epf_assign_bar_space(epf, sz, barno, epc_features, 0, low); + if (ret) { + dev_err(&epf->dev, "Failed to assign Doorbell BAR space\n"); + goto err_free_irq; + } + + ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no, + ntb->epf->vfunc_no, db_bar); + if (ret) { + dev_err(&epf->dev, "Failed to set Doorbell BAR\n"); + goto err_free_irq; + } + + for (i = 0; i < ntb->db_count; i++) { + struct msi_msg *msg = &epf->db_msg[i].msg; + dma_addr_t addr; + size_t offset; + + ret = pci_epf_align_inbound_addr(epf, db_bar->barno, + ((u64)msg->address_hi << 32) | msg->address_lo, + &addr, &offset); + + if (ret) { + ntb->msi_doorbell = false; + goto err_free_irq; + } + + ntb->reg->db_data[i] = msg->data; + ntb->reg->db_offset[i] = offset; + } + + ntb->reg->db_entry_size = 0; + + ntb->msi_doorbell = true; + + return 0; + +err_free_irq: + for (i--; i >= 0; i--) + free_irq(epf->db_msg[i].virq, ntb); + + pci_epf_free_doorbell(ntb->epf); + return ret; +} + /** * epf_ntb_db_bar_init() - Configure Doorbell window BARs * @ntb: NTB device that facilitates communication between HOST and VHOST @@ -520,21 +625,25 @@ static int epf_ntb_db_bar_init(struct epf_ntb *ntb) ntb->epf->func_no, ntb->epf->vfunc_no); barno = ntb->epf_ntb_bar[BAR_DB]; - - mw_addr = pci_epf_alloc_space(ntb->epf, size, barno, epc_features, 0); - if (!mw_addr) { - dev_err(dev, "Failed to allocate OB address\n"); - return -ENOMEM; - } - - ntb->epf_db = mw_addr; - epf_bar = &ntb->epf->bar[barno]; - ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no, ntb->epf->vfunc_no, epf_bar); + ret = epf_ntb_db_bar_init_msi_doorbell(ntb, epf_bar, epc_features, barno); if (ret) { - dev_err(dev, "Doorbell BAR set failed\n"); + /* fall back to polling mode */ + mw_addr = pci_epf_alloc_space(ntb->epf, size, barno, epc_features, 0); + if (!mw_addr) { + dev_err(dev, "Failed to allocate OB address\n"); + return -ENOMEM; + } + + ntb->epf_db = mw_addr; + + ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no, + ntb->epf->vfunc_no, epf_bar); + if (ret) { + dev_err(dev, "Doorbell BAR set failed\n"); goto err_alloc_peer_mem; + } } return ret; @@ -554,6 +663,16 @@ static 
void epf_ntb_db_bar_clear(struct epf_ntb *ntb) { enum pci_barno barno; + if (ntb->msi_doorbell) { + int i; + + for (i = 0; i < ntb->db_count; i++) + free_irq(ntb->epf->db_msg[i].virq, ntb); + } + + if (ntb->epf->db_msg) + pci_epf_free_doorbell(ntb->epf); + barno = ntb->epf_ntb_bar[BAR_DB]; pci_epf_free_space(ntb->epf, ntb->epf_db, barno, 0); pci_epc_clear_bar(ntb->epf->epc, @@ -1268,7 +1387,7 @@ static u64 vntb_epf_db_read(struct ntb_dev *ndev) { struct epf_ntb *ntb = ntb_ndev(ndev); - return ntb->db; + return atomic64_read(&ntb->db); } static int vntb_epf_mw_get_align(struct ntb_dev *ndev, int pidx, int idx, @@ -1308,7 +1427,7 @@ static int vntb_epf_db_clear(struct ntb_dev *ndev, u64 db_bits) { struct epf_ntb *ntb = ntb_ndev(ndev); - ntb->db &= ~db_bits; + atomic64_and(~db_bits, &ntb->db); return 0; } diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index d54e18872aef..9a505c796370 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -208,6 +208,48 @@ void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf) } EXPORT_SYMBOL_GPL(pci_epf_remove_vepf); +static int pci_epf_get_required_bar_size(struct pci_epf *epf, size_t *bar_size, + size_t *aligned_mem_size, + enum pci_barno bar, + const struct pci_epc_features *epc_features, + enum pci_epc_interface_type type) +{ + u64 bar_fixed_size = epc_features->bar[bar].fixed_size; + size_t align = epc_features->align; + size_t size = *bar_size; + + if (size < 128) + size = 128; + + /* According to PCIe base spec, min size for a resizable BAR is 1 MB. */ + if (epc_features->bar[bar].type == BAR_RESIZABLE && size < SZ_1M) + size = SZ_1M; + + if (epc_features->bar[bar].type == BAR_FIXED && bar_fixed_size) { + if (size > bar_fixed_size) { + dev_err(&epf->dev, + "requested BAR size is larger than fixed size\n"); + return -ENOMEM; + } + size = bar_fixed_size; + } else { + /* BAR size must be power of two */ + size = roundup_pow_of_two(size); + } + + *bar_size = size; + + /* + * The EPC's BAR start address must meet alignment requirements. In most + * cases, the alignment will match the BAR size. However, differences + * can occur, for example, when the fixed BAR size (e.g., 128 bytes) is + * smaller than the required alignment (e.g., 4 KB). + */ + *aligned_mem_size = align ? ALIGN(size, align) : size; + + return 0; +} + /** * pci_epf_free_space() - free the allocated PCI EPF register space * @epf: the EPF device from whom to free the memory @@ -236,13 +278,13 @@ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar, } dev = epc->dev.parent; - dma_free_coherent(dev, epf_bar[bar].aligned_size, addr, + dma_free_coherent(dev, epf_bar[bar].mem_size, addr, epf_bar[bar].phys_addr); epf_bar[bar].phys_addr = 0; epf_bar[bar].addr = NULL; epf_bar[bar].size = 0; - epf_bar[bar].aligned_size = 0; + epf_bar[bar].mem_size = 0; epf_bar[bar].barno = 0; epf_bar[bar].flags = 0; } @@ -264,40 +306,16 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, const struct pci_epc_features *epc_features, enum pci_epc_interface_type type) { - u64 bar_fixed_size = epc_features->bar[bar].fixed_size; - size_t aligned_size, align = epc_features->align; struct pci_epf_bar *epf_bar; dma_addr_t phys_addr; struct pci_epc *epc; struct device *dev; + size_t mem_size; void *space; - if (size < 128) - size = 128; - - /* According to PCIe base spec, min size for a resizable BAR is 1 MB. 
*/ - if (epc_features->bar[bar].type == BAR_RESIZABLE && size < SZ_1M) - size = SZ_1M; - - if (epc_features->bar[bar].type == BAR_FIXED && bar_fixed_size) { - if (size > bar_fixed_size) { - dev_err(&epf->dev, - "requested BAR size is larger than fixed size\n"); - return NULL; - } - size = bar_fixed_size; - } else { - /* BAR size must be power of two */ - size = roundup_pow_of_two(size); - } - - /* - * Allocate enough memory to accommodate the iATU alignment - * requirement. In most cases, this will be the same as .size but - * it might be different if, for example, the fixed size of a BAR - * is smaller than align. - */ - aligned_size = align ? ALIGN(size, align) : size; + if (pci_epf_get_required_bar_size(epf, &size, &mem_size, bar, + epc_features, type)) + return NULL; if (type == PRIMARY_INTERFACE) { epc = epf->epc; @@ -308,7 +326,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, } dev = epc->dev.parent; - space = dma_alloc_coherent(dev, aligned_size, &phys_addr, GFP_KERNEL); + space = dma_alloc_coherent(dev, mem_size, &phys_addr, GFP_KERNEL); if (!space) { dev_err(dev, "failed to allocate mem space\n"); return NULL; @@ -317,7 +335,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, epf_bar[bar].phys_addr = phys_addr; epf_bar[bar].addr = space; epf_bar[bar].size = size; - epf_bar[bar].aligned_size = aligned_size; + epf_bar[bar].mem_size = mem_size; epf_bar[bar].barno = bar; if (upper_32_bits(size) || epc_features->bar[bar].only_64bit) epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; @@ -328,6 +346,83 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, } EXPORT_SYMBOL_GPL(pci_epf_alloc_space); +/** + * pci_epf_assign_bar_space() - Assign PCI EPF BAR space + * @epf: EPF device to assign the BAR memory + * @size: Size of the memory that has to be assigned + * @bar: BAR number for which the memory is assigned + * @epc_features: Features provided by the EPC specific to this EPF + * @type: Identifies if the assignment is for primary EPC or secondary EPC + * @bar_addr: Address to be assigned for the @bar + * + * Invoke to assign memory for the PCI EPF BAR. + * Flag PCI_BASE_ADDRESS_MEM_TYPE_64 will automatically get set if the BAR + * can only be a 64-bit BAR, or if the requested size is larger than 2 GB. + */ +int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size, + enum pci_barno bar, + const struct pci_epc_features *epc_features, + enum pci_epc_interface_type type, + dma_addr_t bar_addr) +{ + size_t bar_size, aligned_mem_size; + struct pci_epf_bar *epf_bar; + dma_addr_t limit; + int pos; + + if (!size) + return -EINVAL; + + limit = bar_addr + size - 1; + + /* + * Bits: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + * bar_addr: U U U U U U 0 X X X X X X X X X + * limit: U U U U U U 1 X X X X X X X X X + * + * bar_addr^limit 0 0 0 0 0 0 1 X X X X X X X X X + * + * U: unchanged address bits in range [bar_addr, limit] + * X: bit 0 or 1 + * + * (bar_addr^limit) & BIT_ULL(pos) will find the first set bit from MSB + * (pos). And value of (2 ^ pos) should be able to cover the BAR range. 
+ */ + for (pos = 8 * sizeof(dma_addr_t) - 1; pos > 0; pos--) + if ((limit ^ bar_addr) & BIT_ULL(pos)) + break; + + if (pos == 8 * sizeof(dma_addr_t) - 1) + return -EINVAL; + + bar_size = BIT_ULL(pos + 1); + if (pci_epf_get_required_bar_size(epf, &bar_size, &aligned_mem_size, + bar, epc_features, type)) + return -ENOMEM; + + if (type == PRIMARY_INTERFACE) + epf_bar = epf->bar; + else + epf_bar = epf->sec_epc_bar; + + epf_bar[bar].phys_addr = ALIGN_DOWN(bar_addr, aligned_mem_size); + + if (epf_bar[bar].phys_addr + bar_size < limit) + return -ENOMEM; + + epf_bar[bar].addr = NULL; + epf_bar[bar].size = bar_size; + epf_bar[bar].mem_size = aligned_mem_size; + epf_bar[bar].barno = bar; + if (upper_32_bits(size) || epc_features->bar[bar].only_64bit) + epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; + else + epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_32; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_epf_assign_bar_space); + static void pci_epf_remove_cfs(struct pci_epf_driver *driver) { struct config_group *group, *tmp; diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index afa50b446567..be5ef6516cff 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -33,6 +33,7 @@ struct device *pci_get_host_bridge_device(struct pci_dev *dev) kobject_get(&bridge->kobj); return bridge; } +EXPORT_SYMBOL_GPL(pci_get_host_bridge_device); void pci_put_host_bridge_device(struct device *dev) { diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 77dee43b7858..00784a60ba80 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -158,8 +158,7 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno) return dev->sriov->barsz[pci_resource_num_to_vf_bar(resno)]; } -void pci_iov_resource_set_size(struct pci_dev *dev, int resno, - resource_size_t size) +void pci_iov_resource_set_size(struct pci_dev *dev, int resno, int size) { if (!pci_resource_is_iov(resno)) { pci_warn(dev, "%s is not an IOV resource\n", @@ -167,7 +166,8 @@ void pci_iov_resource_set_size(struct pci_dev *dev, int resno, return; } - dev->sriov->barsz[pci_resource_num_to_vf_bar(resno)] = size; + resno = pci_resource_num_to_vf_bar(resno); + dev->sriov->barsz[resno] = pci_rebar_size_to_bytes(size); } bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev) @@ -1339,29 +1339,16 @@ EXPORT_SYMBOL_GPL(pci_sriov_configure_simple); */ int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size) { - u32 sizes; - int ret; - if (!pci_resource_is_iov(resno)) return -EINVAL; if (pci_iov_is_memory_decoding_enabled(dev)) return -EBUSY; - sizes = pci_rebar_get_possible_sizes(dev, resno); - if (!sizes) - return -ENOTSUPP; - - if (!(sizes & BIT(size))) + if (!pci_rebar_size_supported(dev, resno, size)) return -EINVAL; - ret = pci_rebar_set_size(dev, resno, size); - if (ret) - return ret; - - pci_iov_resource_set_size(dev, resno, pci_rebar_size_to_bytes(size)); - - return 0; + return pci_rebar_set_size(dev, resno, size); } EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size); @@ -1380,7 +1367,7 @@ EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size); u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs) { u64 vf_len = pci_resource_len(dev, resno); - u32 sizes; + u64 sizes; if (!num_vfs) return 0; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 78e108e47254..981a76b6b7c0 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -25,12 +25,12 @@ struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; struct xarray map_types; + struct p2pdma_provider mem[PCI_STD_NUM_BARS]; }; 
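/*
 * Illustrative caller sketch (not part of the hunks above or below): with the
 * per-BAR mem[] table added to struct pci_p2pdma, a driver that only needs a
 * provider, not the page-backed pool, can use the pcim_p2pdma_init() and
 * pcim_p2pdma_provider() helpers introduced further down in this file. The
 * function and variable names below are hypothetical, and the declarations
 * are assumed to be in scope as exported by this series.
 */
static int example_get_provider(struct pci_dev *pdev, int bar,
				struct p2pdma_provider **provider)
{
	int ret;

	/* devres-managed: torn down automatically when the driver unbinds */
	ret = pcim_p2pdma_init(pdev);
	if (ret)
		return ret;

	/* returns NULL for I/O port or unimplemented BARs */
	*provider = pcim_p2pdma_provider(pdev, bar);
	if (!*provider)
		return -EINVAL;

	/*
	 * Adding (*provider)->bus_offset to a CPU physical address inside
	 * this BAR yields the corresponding PCI bus address.
	 */
	return 0;
}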
struct pci_p2pdma_pagemap { - struct pci_dev *provider; - u64 bus_offset; struct dev_pagemap pgmap; + struct p2pdma_provider *mem; }; static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) @@ -204,8 +204,8 @@ static void p2pdma_page_free(struct page *page) { struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ - struct pci_p2pdma *p2pdma = - rcu_dereference_protected(pgmap->provider->p2pdma, 1); + struct pci_p2pdma *p2pdma = rcu_dereference_protected( + to_pci_dev(pgmap->mem->owner)->p2pdma, 1); struct percpu_ref *ref; gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page), @@ -228,56 +228,136 @@ static void pci_p2pdma_release(void *data) /* Flush and disable pci_alloc_p2p_mem() */ pdev->p2pdma = NULL; - synchronize_rcu(); + if (p2pdma->pool) + synchronize_rcu(); + xa_destroy(&p2pdma->map_types); + + if (!p2pdma->pool) + return; gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); - xa_destroy(&p2pdma->map_types); } -static int pci_p2pdma_setup(struct pci_dev *pdev) +/** + * pcim_p2pdma_init - Initialise peer-to-peer DMA providers + * @pdev: The PCI device to enable P2PDMA for + * + * This function initializes the peer-to-peer DMA infrastructure + * for a PCI device. It allocates and sets up the necessary data + * structures to support P2PDMA operations, including mapping type + * tracking. + */ +int pcim_p2pdma_init(struct pci_dev *pdev) { - int error = -ENOMEM; struct pci_p2pdma *p2p; + int i, ret; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2p) + return 0; p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL); if (!p2p) return -ENOMEM; xa_init(&p2p->map_types); + /* + * Iterate over all standard PCI BARs and record only those that + * correspond to MMIO regions. Skip non-memory resources (e.g. I/O + * port BARs) since they cannot be used for peer-to-peer (P2P) + * transactions. + */ + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; - p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); - if (!p2p->pool) - goto out; + p2p->mem[i].owner = &pdev->dev; + p2p->mem[i].bus_offset = + pci_bus_address(pdev, i) - pci_resource_start(pdev, i); + } - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); - if (error) - goto out_pool_destroy; + ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); + if (ret) + goto out_p2p; - error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); - if (error) + rcu_assign_pointer(pdev->p2pdma, p2p); + return 0; + +out_p2p: + devm_kfree(&pdev->dev, p2p); + return ret; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_init); + +/** + * pcim_p2pdma_provider - Get peer-to-peer DMA provider + * @pdev: The PCI device to enable P2PDMA for + * @bar: BAR index to get provider + * + * This function gets peer-to-peer DMA provider for a PCI device. The lifetime + * of the provider (and of course the MMIO) is bound to the lifetime of the + * driver. A driver calling this function must ensure that all references to the + * provider, and any DMA mappings created for any MMIO, are all cleaned up + * before the driver remove() completes. + * + * Since P2P is almost always shared with a second driver this means some system + * to notify, invalidate and revoke the MMIO's DMA must be in place to use this + * function. For example a revoke can be built using DMABUF. 
+ */ +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar) +{ + struct pci_p2pdma *p2p; + + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) + return NULL; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (WARN_ON(!p2p)) + /* Someone forgot to call to pcim_p2pdma_init() before */ + return NULL; + + return &p2p->mem[bar]; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_provider); + +static int pci_p2pdma_setup_pool(struct pci_dev *pdev) +{ + struct pci_p2pdma *p2pdma; + int ret; + + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2pdma->pool) + /* We already setup pools, do nothing, */ + return 0; + + p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); + if (!p2pdma->pool) + return -ENOMEM; + + ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); + if (ret) goto out_pool_destroy; - rcu_assign_pointer(pdev->p2pdma, p2p); return 0; out_pool_destroy: - gen_pool_destroy(p2p->pool); -out: - devm_kfree(&pdev->dev, p2p); - return error; + gen_pool_destroy(p2pdma->pool); + p2pdma->pool = NULL; + return ret; } static void pci_p2pdma_unmap_mappings(void *data) { - struct pci_dev *pdev = data; + struct pci_p2pdma_pagemap *p2p_pgmap = data; /* * Removing the alloc attribute from sysfs will call * unmap_mapping_range() on the inode, teardown any existing userspace * mappings and prevent new ones from being created. */ - sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr, + sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj, + &p2pmem_alloc_attr.attr, p2pmem_group.name); } @@ -295,6 +375,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { struct pci_p2pdma_pagemap *p2p_pgmap; + struct p2pdma_provider *mem; struct dev_pagemap *pgmap; struct pci_p2pdma *p2pdma; void *addr; @@ -312,11 +393,21 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, if (size + offset > pci_resource_len(pdev, bar)) return -EINVAL; - if (!pdev->p2pdma) { - error = pci_p2pdma_setup(pdev); - if (error) - return error; - } + error = pcim_p2pdma_init(pdev); + if (error) + return error; + + error = pci_p2pdma_setup_pool(pdev); + if (error) + return error; + + mem = pcim_p2pdma_provider(pdev, bar); + /* + * We checked validity of BAR prior to call + * to pcim_p2pdma_provider. It should never return NULL. 
+ */ + if (WARN_ON(!mem)) + return -EINVAL; p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL); if (!p2p_pgmap) @@ -328,10 +419,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->ops = &p2pdma_pgmap_ops; - - p2p_pgmap->provider = pdev; - p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - - pci_resource_start(pdev, bar); + p2p_pgmap->mem = mem; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { @@ -340,7 +428,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, } error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, - pdev); + p2p_pgmap); if (error) goto pages_free; @@ -972,16 +1060,26 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); -static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, - struct device *dev) +/** + * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers + * @provider: P2PDMA provider structure + * @dev: Target device for the transfer + * + * Determines how peer-to-peer DMA transfers should be mapped between + * the provider and the target device. The mapping type indicates whether + * the transfer can be done directly through PCI switches or must go + * through the host bridge. + */ +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev) { enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED; - struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider; + struct pci_dev *pdev = to_pci_dev(provider->owner); struct pci_dev *client; struct pci_p2pdma *p2pdma; int dist; - if (!provider->p2pdma) + if (!pdev->p2pdma) return PCI_P2PDMA_MAP_NOT_SUPPORTED; if (!dev_is_pci(dev)) @@ -990,7 +1088,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, client = to_pci_dev(dev); rcu_read_lock(); - p2pdma = rcu_dereference(provider->p2pdma); + p2pdma = rcu_dereference(pdev->p2pdma); if (p2pdma) type = xa_to_value(xa_load(&p2pdma->map_types, @@ -998,7 +1096,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, rcu_read_unlock(); if (type == PCI_P2PDMA_MAP_UNKNOWN) - return calc_map_type_and_dist(provider, client, &dist, true); + return calc_map_type_and_dist(pdev, client, &dist, true); return type; } @@ -1006,9 +1104,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { - state->pgmap = page_pgmap(page); - state->map = pci_p2pdma_map_type(state->pgmap, dev); - state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset; + struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page)); + + if (state->mem == p2p_pgmap->mem) + return; + + state->mem = p2p_pgmap->mem; + state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev); } /** diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 302d61783f6c..7c2d9d596258 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -629,6 +629,8 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state) struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; + pci_dev->state_saved = false; + if (drv && drv->suspend) { pci_power_t prev = pci_dev->current_state; int error; @@ -1036,6 +1038,8 @@ static int pci_pm_freeze(struct device *dev) if (!pm) { 
pci_pm_default_suspend(pci_dev); + if (!pm_runtime_suspended(dev)) + pci_dev->state_saved = false; return 0; } @@ -1129,8 +1133,6 @@ static int pci_pm_thaw(struct device *dev) pci_pm_reenable_device(pci_dev); } - pci_dev->state_saved = false; - return error; } diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 3881359440b1..80a7c4fe6b03 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1587,7 +1587,7 @@ static ssize_t __resource_resize_show(struct device *dev, int n, char *buf) pci_config_pm_runtime_get(pdev); ret = sysfs_emit(buf, "%016llx\n", - (u64)pci_rebar_get_possible_sizes(pdev, n)); + pci_rebar_get_possible_sizes(pdev, n)); pci_config_pm_runtime_put(pdev); @@ -1599,18 +1599,13 @@ static ssize_t __resource_resize_store(struct device *dev, int n, { struct pci_dev *pdev = to_pci_dev(dev); struct pci_bus *bus = pdev->bus; - struct resource *b_win, *res; unsigned long size; - int ret, i; + int ret; u16 cmd; if (kstrtoul(buf, 0, &size) < 0) return -EINVAL; - b_win = pbus_select_window(bus, pci_resource_n(pdev, n)); - if (!b_win) - return -EINVAL; - device_lock(dev); if (dev->driver || pci_num_vf(pdev)) { ret = -EBUSY; @@ -1632,15 +1627,7 @@ static ssize_t __resource_resize_store(struct device *dev, int n, pci_remove_resource_files(pdev); - pci_dev_for_each_resource(pdev, res, i) { - if (i >= PCI_BRIDGE_RESOURCES) - break; - - if (b_win == pbus_select_window(bus, res)) - pci_release_resource(pdev, i); - } - - ret = pci_resize_resource(pdev, n, size); + ret = pci_resize_resource(pdev, n, size, 0); pci_assign_unassigned_bus_resources(bus); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b14dd064006c..13dbb405dc31 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1823,41 +1823,12 @@ static void pci_restore_config_space(struct pci_dev *pdev) } } -static void pci_restore_rebar_state(struct pci_dev *pdev) -{ - unsigned int pos, nbars, i; - u32 ctrl; - - pos = pdev->rebar_cap; - if (!pos) - return; - - pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); - nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl); - - for (i = 0; i < nbars; i++, pos += 8) { - struct resource *res; - int bar_idx, size; - - pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); - bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX; - res = pci_resource_n(pdev, bar_idx); - size = pci_rebar_bytes_to_size(resource_size(res)); - ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; - ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size); - pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); - } -} - /** * pci_restore_state - Restore the saved state of a PCI device * @dev: PCI device that we're dealing with */ void pci_restore_state(struct pci_dev *dev) { - if (!dev->state_saved) - return; - pci_restore_pcie_state(dev); pci_restore_pasid_state(dev); pci_restore_pri_state(dev); @@ -3687,125 +3658,6 @@ void pci_acs_init(struct pci_dev *dev) pci_enable_acs(dev); } -void pci_rebar_init(struct pci_dev *pdev) -{ - pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR); -} - -/** - * pci_rebar_find_pos - find position of resize ctrl reg for BAR - * @pdev: PCI device - * @bar: BAR to find - * - * Helper to find the position of the ctrl register for a BAR. - * Returns -ENOTSUPP if resizable BARs are not supported at all. - * Returns -ENOENT if no ctrl register for the BAR could be found. 
- */ -static int pci_rebar_find_pos(struct pci_dev *pdev, int bar) -{ - unsigned int pos, nbars, i; - u32 ctrl; - - if (pci_resource_is_iov(bar)) { - pos = pci_iov_vf_rebar_cap(pdev); - bar = pci_resource_num_to_vf_bar(bar); - } else { - pos = pdev->rebar_cap; - } - - if (!pos) - return -ENOTSUPP; - - pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); - nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl); - - for (i = 0; i < nbars; i++, pos += 8) { - int bar_idx; - - pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); - bar_idx = FIELD_GET(PCI_REBAR_CTRL_BAR_IDX, ctrl); - if (bar_idx == bar) - return pos; - } - - return -ENOENT; -} - -/** - * pci_rebar_get_possible_sizes - get possible sizes for BAR - * @pdev: PCI device - * @bar: BAR to query - * - * Get the possible sizes of a resizable BAR as bitmask defined in the spec - * (bit 0=1MB, bit 31=128TB). Returns 0 if BAR isn't resizable. - */ -u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) -{ - int pos; - u32 cap; - - pos = pci_rebar_find_pos(pdev, bar); - if (pos < 0) - return 0; - - pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap); - cap = FIELD_GET(PCI_REBAR_CAP_SIZES, cap); - - /* Sapphire RX 5600 XT Pulse has an invalid cap dword for BAR 0 */ - if (pdev->vendor == PCI_VENDOR_ID_ATI && pdev->device == 0x731f && - bar == 0 && cap == 0x700) - return 0x3f00; - - return cap; -} -EXPORT_SYMBOL(pci_rebar_get_possible_sizes); - -/** - * pci_rebar_get_current_size - get the current size of a BAR - * @pdev: PCI device - * @bar: BAR to set size to - * - * Read the size of a BAR from the resizable BAR config. - * Returns size if found or negative error code. - */ -int pci_rebar_get_current_size(struct pci_dev *pdev, int bar) -{ - int pos; - u32 ctrl; - - pos = pci_rebar_find_pos(pdev, bar); - if (pos < 0) - return pos; - - pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); - return FIELD_GET(PCI_REBAR_CTRL_BAR_SIZE, ctrl); -} - -/** - * pci_rebar_set_size - set a new size for a BAR - * @pdev: PCI device - * @bar: BAR to set size to - * @size: new size as defined in the spec (0=1MB, 31=128TB) - * - * Set the new size of a BAR as defined in the spec. - * Returns zero if resizing was successful, error code otherwise. 
- */ -int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size) -{ - int pos; - u32 ctrl; - - pos = pci_rebar_find_pos(pdev, bar); - if (pos < 0) - return pos; - - pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); - ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; - ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size); - pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); - return 0; -} - /** * pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port * @dev: the PCI device @@ -6656,9 +6508,31 @@ static void pci_no_domains(void) #endif } +#ifdef CONFIG_PCI_DOMAINS +static DEFINE_IDA(pci_domain_nr_dynamic_ida); + +/** + * pci_bus_find_emul_domain_nr() - allocate a PCI domain number per constraints + * @hint: desired domain, 0 if any ID in the range of @min to @max is acceptable + * @min: minimum allowable domain + * @max: maximum allowable domain, no IDs higher than INT_MAX will be returned + */ +int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max) +{ + return ida_alloc_range(&pci_domain_nr_dynamic_ida, max(hint, min), max, + GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(pci_bus_find_emul_domain_nr); + +void pci_bus_release_emul_domain_nr(int domain_nr) +{ + ida_free(&pci_domain_nr_dynamic_ida, domain_nr); +} +EXPORT_SYMBOL_GPL(pci_bus_release_emul_domain_nr); +#endif + #ifdef CONFIG_PCI_DOMAINS_GENERIC static DEFINE_IDA(pci_domain_nr_static_ida); -static DEFINE_IDA(pci_domain_nr_dynamic_ida); static void of_pci_reserve_static_domain_nr(void) { diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 36f8c0985430..a33bc4e0bf34 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -421,8 +421,10 @@ enum pci_bar_type { struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); +void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size); +int pci_do_resource_release_and_resize(struct pci_dev *dev, int resno, int size, + int exclude_bars); unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge); -int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res); int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align); int pci_configure_extended_tags(struct pci_dev *dev, void *ign); @@ -808,8 +810,7 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); -void pci_iov_resource_set_size(struct pci_dev *dev, int resno, - resource_size_t size); +void pci_iov_resource_set_size(struct pci_dev *dev, int resno, int size); bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev); static inline u16 pci_iov_vf_rebar_cap(struct pci_dev *dev) { @@ -851,7 +852,7 @@ static inline int pci_iov_bus_range(struct pci_bus *bus) return 0; } static inline void pci_iov_resource_set_size(struct pci_dev *dev, int resno, - resource_size_t size) { } + int size) { } static inline bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev) { return false; @@ -1022,12 +1023,9 @@ static inline int acpi_get_rc_resources(struct device *dev, const char *hid, #endif void pci_rebar_init(struct pci_dev *pdev); +void pci_restore_rebar_state(struct pci_dev *pdev); int pci_rebar_get_current_size(struct pci_dev *pdev, int bar); int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size); -static inline u64 pci_rebar_size_to_bytes(int size) -{ - return 1ULL << (size + 20); -} struct 
device_node; diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c index d1b68c18444f..38a41ccf79b9 100644 --- a/drivers/pci/pcie/portdrv.c +++ b/drivers/pci/pcie/portdrv.c @@ -760,7 +760,6 @@ static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev) device_for_each_child(&dev->dev, &off, pcie_port_device_iter); pci_restore_state(dev); - pci_save_state(dev); return PCI_ERS_RESULT_RECOVERED; } diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 65e4b008be00..ed0f9691e7d1 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -81,6 +81,11 @@ void pci_ptm_init(struct pci_dev *dev) dev->ptm_granularity = 0; } + if (cap & PCI_PTM_CAP_RES) + dev->ptm_responder = 1; + if (cap & PCI_PTM_CAP_REQ) + dev->ptm_requester = 1; + if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) pci_enable_ptm(dev, NULL); @@ -144,6 +149,24 @@ static int __pci_enable_ptm(struct pci_dev *dev) return -EINVAL; } + switch (pci_pcie_type(dev)) { + case PCI_EXP_TYPE_ROOT_PORT: + if (!dev->ptm_root) + return -EINVAL; + break; + case PCI_EXP_TYPE_UPSTREAM: + if (!dev->ptm_responder) + return -EINVAL; + break; + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_LEG_END: + if (!dev->ptm_requester) + return -EINVAL; + break; + default: + return -EINVAL; + } + pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl); ctrl |= PCI_PTM_CTRL_ENABLE; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 9cd032dff31e..124d2d309c58 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -650,6 +650,11 @@ static void pci_release_host_bridge_dev(struct device *dev) pci_free_resource_list(&bridge->windows); pci_free_resource_list(&bridge->dma_ranges); + + /* Host bridges only have domain_nr set in the emulation case */ + if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET) + pci_bus_release_emul_domain_nr(bridge->domain_nr); + kfree(bridge); } @@ -1130,7 +1135,8 @@ unregister: device_del(&bridge->dev); free: #ifdef CONFIG_PCI_DOMAINS_GENERIC - pci_bus_release_domain_nr(parent, bus->domain_nr); + if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET) + pci_bus_release_domain_nr(parent, bus->domain_nr); #endif if (bus_registered) put_device(&bus->dev); @@ -2747,8 +2753,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) pci_reassigndev_resource_alignment(dev); - dev->state_saved = false; - pci_init_capabilities(dev); /* @@ -3170,8 +3174,7 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus, * bus number if there is room. */ if (bus->self && bus->self->is_hotplug_bridge) { - used_buses = max_t(unsigned int, available_buses, - pci_hotplug_bus_size - 1); + used_buses = max(available_buses, pci_hotplug_bus_size - 1); if (max - start < used_buses) { max = start + used_buses; diff --git a/drivers/pci/pwrctrl/Kconfig b/drivers/pci/pwrctrl/Kconfig index 6956c1854811..e0f999f299bb 100644 --- a/drivers/pci/pwrctrl/Kconfig +++ b/drivers/pci/pwrctrl/Kconfig @@ -22,6 +22,21 @@ config PCI_PWRCTRL_SLOT PCI slots. The voltage regulators powering the rails of the PCI slots are expected to be defined in the devicetree node of the PCI bridge. +config PCI_PWRCTRL_TC9563 + tristate "PCI Power Control driver for TC9563 PCIe switch" + select PCI_PWRCTRL + default m if ARCH_QCOM + depends on I2C + help + Say Y here to enable the PCI Power Control driver of TC9563 PCIe + switch. + + This driver enables power and configures the TC9563 PCIe switch + through i2c. TC9563 is a PCIe switch which has one upstream and three + downstream ports. 
To one of the downstream ports integrated ethernet + MAC is connected as endpoint device. Other two downstream ports are + supposed to connect to external device. + # deprecated config HAVE_PWRCTL bool diff --git a/drivers/pci/pwrctrl/Makefile b/drivers/pci/pwrctrl/Makefile index a4e5808d7850..13b02282106c 100644 --- a/drivers/pci/pwrctrl/Makefile +++ b/drivers/pci/pwrctrl/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_PCI_PWRCTRL_PWRSEQ) += pci-pwrctrl-pwrseq.o obj-$(CONFIG_PCI_PWRCTRL_SLOT) += pci-pwrctrl-slot.o pci-pwrctrl-slot-y := slot.o + +obj-$(CONFIG_PCI_PWRCTRL_TC9563) += pci-pwrctrl-tc9563.o diff --git a/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c b/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c new file mode 100644 index 000000000000..ec423432ac65 --- /dev/null +++ b/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#include <linux/array_size.h> +#include <linux/bitfield.h> +#include <linux/bits.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/gpio/consumer.h> +#include <linux/i2c.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_platform.h> +#include <linux/pci.h> +#include <linux/pci-pwrctrl.h> +#include <linux/platform_device.h> +#include <linux/regulator/consumer.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/unaligned.h> + +#include "../pci.h" + +#define TC9563_GPIO_CONFIG 0x801208 +#define TC9563_RESET_GPIO 0x801210 + +#define TC9563_PORT_L0S_DELAY 0x82496c +#define TC9563_PORT_L1_DELAY 0x824970 + +#define TC9563_EMBEDDED_ETH_DELAY 0x8200d8 +#define TC9563_ETH_L1_DELAY_MASK GENMASK(27, 18) +#define TC9563_ETH_L1_DELAY_VALUE(x) FIELD_PREP(TC9563_ETH_L1_DELAY_MASK, x) +#define TC9563_ETH_L0S_DELAY_MASK GENMASK(17, 13) +#define TC9563_ETH_L0S_DELAY_VALUE(x) FIELD_PREP(TC9563_ETH_L0S_DELAY_MASK, x) + +#define TC9563_NFTS_2_5_GT 0x824978 +#define TC9563_NFTS_5_GT 0x82497c + +#define TC9563_PORT_LANE_ACCESS_ENABLE 0x828000 + +#define TC9563_PHY_RATE_CHANGE_OVERRIDE 0x828040 +#define TC9563_PHY_RATE_CHANGE 0x828050 + +#define TC9563_TX_MARGIN 0x828234 + +#define TC9563_DFE_ENABLE 0x828a04 +#define TC9563_DFE_EQ0_MODE 0x828a08 +#define TC9563_DFE_EQ1_MODE 0x828a0c +#define TC9563_DFE_EQ2_MODE 0x828a14 +#define TC9563_DFE_PD_MASK 0x828254 + +#define TC9563_PORT_SELECT 0x82c02c +#define TC9563_PORT_ACCESS_ENABLE 0x82c030 + +#define TC9563_POWER_CONTROL 0x82b09c +#define TC9563_POWER_CONTROL_OVREN 0x82b2c8 + +#define TC9563_GPIO_MASK 0xfffffff3 +#define TC9563_GPIO_DEASSERT_BITS 0xc /* Bits to clear for GPIO deassert */ + +#define TC9563_TX_MARGIN_MIN_UA 400000 + +/* + * From TC9563 PORSYS rev 0.2, figure 1.1 POR boot sequence + * wait for 10ms for the internal osc frequency to stabilize. 
+ */ +#define TC9563_OSC_STAB_DELAY_US (10 * USEC_PER_MSEC) + +#define TC9563_L0S_L1_DELAY_UNIT_NS 256 /* Each unit represents 256 nanoseconds */ + +struct tc9563_pwrctrl_reg_setting { + unsigned int offset; + unsigned int val; +}; + +enum tc9563_pwrctrl_ports { + TC9563_USP, + TC9563_DSP1, + TC9563_DSP2, + TC9563_DSP3, + TC9563_ETHERNET, + TC9563_MAX +}; + +struct tc9563_pwrctrl_cfg { + u32 l0s_delay; + u32 l1_delay; + u32 tx_amp; + u8 nfts[2]; /* GEN1 & GEN2 */ + bool disable_dfe; + bool disable_port; +}; + +#define TC9563_PWRCTL_MAX_SUPPLY 6 + +static const char *const tc9563_supply_names[TC9563_PWRCTL_MAX_SUPPLY] = { + "vddc", + "vdd18", + "vdd09", + "vddio1", + "vddio2", + "vddio18", +}; + +struct tc9563_pwrctrl_ctx { + struct regulator_bulk_data supplies[TC9563_PWRCTL_MAX_SUPPLY]; + struct tc9563_pwrctrl_cfg cfg[TC9563_MAX]; + struct gpio_desc *reset_gpio; + struct i2c_adapter *adapter; + struct i2c_client *client; + struct pci_pwrctrl pwrctrl; +}; + +/* + * downstream port power off sequence, hardcoding the address + * as we don't know register names for these register offsets. + */ +static const struct tc9563_pwrctrl_reg_setting common_pwroff_seq[] = { + {0x82900c, 0x1}, + {0x829010, 0x1}, + {0x829018, 0x0}, + {0x829020, 0x1}, + {0x82902c, 0x1}, + {0x829030, 0x1}, + {0x82903c, 0x1}, + {0x829058, 0x0}, + {0x82905c, 0x1}, + {0x829060, 0x1}, + {0x8290cc, 0x1}, + {0x8290d0, 0x1}, + {0x8290d8, 0x1}, + {0x8290e0, 0x1}, + {0x8290e8, 0x1}, + {0x8290ec, 0x1}, + {0x8290f4, 0x1}, + {0x82910c, 0x1}, + {0x829110, 0x1}, + {0x829114, 0x1}, +}; + +static const struct tc9563_pwrctrl_reg_setting dsp1_pwroff_seq[] = { + {TC9563_PORT_ACCESS_ENABLE, 0x2}, + {TC9563_PORT_LANE_ACCESS_ENABLE, 0x3}, + {TC9563_POWER_CONTROL, 0x014f4804}, + {TC9563_POWER_CONTROL_OVREN, 0x1}, + {TC9563_PORT_ACCESS_ENABLE, 0x4}, +}; + +static const struct tc9563_pwrctrl_reg_setting dsp2_pwroff_seq[] = { + {TC9563_PORT_ACCESS_ENABLE, 0x8}, + {TC9563_PORT_LANE_ACCESS_ENABLE, 0x1}, + {TC9563_POWER_CONTROL, 0x014f4804}, + {TC9563_POWER_CONTROL_OVREN, 0x1}, + {TC9563_PORT_ACCESS_ENABLE, 0x8}, +}; + +/* + * Since all transfers are initiated by the probe, no locks are necessary, + * as there are no concurrent calls. + */ +static int tc9563_pwrctrl_i2c_write(struct i2c_client *client, + u32 reg_addr, u32 reg_val) +{ + struct i2c_msg msg; + u8 msg_buf[7]; + int ret; + + msg.addr = client->addr; + msg.len = 7; + msg.flags = 0; + + /* Big Endian for reg addr */ + put_unaligned_be24(reg_addr, &msg_buf[0]); + + /* Little Endian for reg val */ + put_unaligned_le32(reg_val, &msg_buf[3]); + + msg.buf = msg_buf; + ret = i2c_transfer(client->adapter, &msg, 1); + return ret == 1 ? 0 : ret; +} + +static int tc9563_pwrctrl_i2c_read(struct i2c_client *client, + u32 reg_addr, u32 *reg_val) +{ + struct i2c_msg msg[2]; + u8 wr_data[3]; + u32 rd_data; + int ret; + + msg[0].addr = client->addr; + msg[0].len = 3; + msg[0].flags = 0; + + /* Big Endian for reg addr */ + put_unaligned_be24(reg_addr, &wr_data[0]); + + msg[0].buf = wr_data; + + msg[1].addr = client->addr; + msg[1].len = 4; + msg[1].flags = I2C_M_RD; + + msg[1].buf = (u8 *)&rd_data; + + ret = i2c_transfer(client->adapter, &msg[0], 2); + if (ret == 2) { + *reg_val = get_unaligned_le32(&rd_data); + return 0; + } + + /* If only one message successfully completed, return -EIO */ + return ret == 1 ? 
-EIO : ret; +} + +static int tc9563_pwrctrl_i2c_bulk_write(struct i2c_client *client, + const struct tc9563_pwrctrl_reg_setting *seq, int len) +{ + int ret, i; + + for (i = 0; i < len; i++) { + ret = tc9563_pwrctrl_i2c_write(client, seq[i].offset, seq[i].val); + if (ret) + return ret; + } + + return 0; +} + +static int tc9563_pwrctrl_disable_port(struct tc9563_pwrctrl_ctx *ctx, + enum tc9563_pwrctrl_ports port) +{ + struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port]; + const struct tc9563_pwrctrl_reg_setting *seq; + int ret, len; + + if (!cfg->disable_port) + return 0; + + if (port == TC9563_DSP1) { + seq = dsp1_pwroff_seq; + len = ARRAY_SIZE(dsp1_pwroff_seq); + } else { + seq = dsp2_pwroff_seq; + len = ARRAY_SIZE(dsp2_pwroff_seq); + } + + ret = tc9563_pwrctrl_i2c_bulk_write(ctx->client, seq, len); + if (ret) + return ret; + + return tc9563_pwrctrl_i2c_bulk_write(ctx->client, + common_pwroff_seq, ARRAY_SIZE(common_pwroff_seq)); +} + +static int tc9563_pwrctrl_set_l0s_l1_entry_delay(struct tc9563_pwrctrl_ctx *ctx, + enum tc9563_pwrctrl_ports port, bool is_l1, u32 ns) +{ + u32 rd_val, units; + int ret; + + if (ns < TC9563_L0S_L1_DELAY_UNIT_NS) + return 0; + + /* convert to units of 256ns */ + units = ns / TC9563_L0S_L1_DELAY_UNIT_NS; + + if (port == TC9563_ETHERNET) { + ret = tc9563_pwrctrl_i2c_read(ctx->client, TC9563_EMBEDDED_ETH_DELAY, &rd_val); + if (ret) + return ret; + + if (is_l1) + rd_val = u32_replace_bits(rd_val, units, TC9563_ETH_L1_DELAY_MASK); + else + rd_val = u32_replace_bits(rd_val, units, TC9563_ETH_L0S_DELAY_MASK); + + return tc9563_pwrctrl_i2c_write(ctx->client, TC9563_EMBEDDED_ETH_DELAY, rd_val); + } + + ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_PORT_SELECT, BIT(port)); + if (ret) + return ret; + + return tc9563_pwrctrl_i2c_write(ctx->client, + is_l1 ? 
TC9563_PORT_L1_DELAY : TC9563_PORT_L0S_DELAY, units); +} + +static int tc9563_pwrctrl_set_tx_amplitude(struct tc9563_pwrctrl_ctx *ctx, + enum tc9563_pwrctrl_ports port) +{ + u32 amp = ctx->cfg[port].tx_amp; + int port_access; + + if (amp < TC9563_TX_MARGIN_MIN_UA) + return 0; + + /* txmargin = (Amp(uV) - 400000) / 3125 */ + amp = (amp - TC9563_TX_MARGIN_MIN_UA) / 3125; + + switch (port) { + case TC9563_USP: + port_access = 0x1; + break; + case TC9563_DSP1: + port_access = 0x2; + break; + case TC9563_DSP2: + port_access = 0x8; + break; + default: + return -EINVAL; + } + + struct tc9563_pwrctrl_reg_setting tx_amp_seq[] = { + {TC9563_PORT_ACCESS_ENABLE, port_access}, + {TC9563_PORT_LANE_ACCESS_ENABLE, 0x3}, + {TC9563_TX_MARGIN, amp}, + }; + + return tc9563_pwrctrl_i2c_bulk_write(ctx->client, tx_amp_seq, ARRAY_SIZE(tx_amp_seq)); +} + +static int tc9563_pwrctrl_disable_dfe(struct tc9563_pwrctrl_ctx *ctx, + enum tc9563_pwrctrl_ports port) +{ + struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port]; + int port_access, lane_access = 0x3; + u32 phy_rate = 0x21; + + if (!cfg->disable_dfe) + return 0; + + switch (port) { + case TC9563_USP: + phy_rate = 0x1; + port_access = 0x1; + break; + case TC9563_DSP1: + port_access = 0x2; + break; + case TC9563_DSP2: + port_access = 0x8; + lane_access = 0x1; + break; + default: + return -EINVAL; + } + + struct tc9563_pwrctrl_reg_setting disable_dfe_seq[] = { + {TC9563_PORT_ACCESS_ENABLE, port_access}, + {TC9563_PORT_LANE_ACCESS_ENABLE, lane_access}, + {TC9563_DFE_ENABLE, 0x0}, + {TC9563_DFE_EQ0_MODE, 0x411}, + {TC9563_DFE_EQ1_MODE, 0x11}, + {TC9563_DFE_EQ2_MODE, 0x11}, + {TC9563_DFE_PD_MASK, 0x7}, + {TC9563_PHY_RATE_CHANGE_OVERRIDE, 0x10}, + {TC9563_PHY_RATE_CHANGE, phy_rate}, + {TC9563_PHY_RATE_CHANGE, 0x0}, + {TC9563_PHY_RATE_CHANGE_OVERRIDE, 0x0}, + }; + + return tc9563_pwrctrl_i2c_bulk_write(ctx->client, + disable_dfe_seq, ARRAY_SIZE(disable_dfe_seq)); +} + +static int tc9563_pwrctrl_set_nfts(struct tc9563_pwrctrl_ctx *ctx, + enum tc9563_pwrctrl_ports port) +{ + u8 *nfts = ctx->cfg[port].nfts; + struct tc9563_pwrctrl_reg_setting nfts_seq[] = { + {TC9563_NFTS_2_5_GT, nfts[0]}, + {TC9563_NFTS_5_GT, nfts[1]}, + }; + int ret; + + if (!nfts[0]) + return 0; + + ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_PORT_SELECT, BIT(port)); + if (ret) + return ret; + + return tc9563_pwrctrl_i2c_bulk_write(ctx->client, nfts_seq, ARRAY_SIZE(nfts_seq)); +} + +static int tc9563_pwrctrl_assert_deassert_reset(struct tc9563_pwrctrl_ctx *ctx, bool deassert) +{ + int ret, val; + + ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_GPIO_CONFIG, TC9563_GPIO_MASK); + if (ret) + return ret; + + val = deassert ? TC9563_GPIO_DEASSERT_BITS : 0; + + return tc9563_pwrctrl_i2c_write(ctx->client, TC9563_RESET_GPIO, val); +} + +static int tc9563_pwrctrl_parse_device_dt(struct tc9563_pwrctrl_ctx *ctx, struct device_node *node, + enum tc9563_pwrctrl_ports port) +{ + struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port]; + int ret; + + /* Disable port if the status of the port is disabled. 
*/ + if (!of_device_is_available(node)) { + cfg->disable_port = true; + return 0; + } + + ret = of_property_read_u32(node, "aspm-l0s-entry-delay-ns", &cfg->l0s_delay); + if (ret && ret != -EINVAL) + return ret; + + ret = of_property_read_u32(node, "aspm-l1-entry-delay-ns", &cfg->l1_delay); + if (ret && ret != -EINVAL) + return ret; + + ret = of_property_read_u32(node, "toshiba,tx-amplitude-microvolt", &cfg->tx_amp); + if (ret && ret != -EINVAL) + return ret; + + ret = of_property_read_u8_array(node, "n-fts", cfg->nfts, ARRAY_SIZE(cfg->nfts)); + if (ret && ret != -EINVAL) + return ret; + + cfg->disable_dfe = of_property_read_bool(node, "toshiba,no-dfe-support"); + + return 0; +} + +static void tc9563_pwrctrl_power_off(struct tc9563_pwrctrl_ctx *ctx) +{ + gpiod_set_value(ctx->reset_gpio, 1); + + regulator_bulk_disable(ARRAY_SIZE(ctx->supplies), ctx->supplies); +} + +static int tc9563_pwrctrl_bring_up(struct tc9563_pwrctrl_ctx *ctx) +{ + struct tc9563_pwrctrl_cfg *cfg; + int ret, i; + + ret = regulator_bulk_enable(ARRAY_SIZE(ctx->supplies), ctx->supplies); + if (ret < 0) + return dev_err_probe(ctx->pwrctrl.dev, ret, "cannot enable regulators\n"); + + gpiod_set_value(ctx->reset_gpio, 0); + + fsleep(TC9563_OSC_STAB_DELAY_US); + + ret = tc9563_pwrctrl_assert_deassert_reset(ctx, false); + if (ret) + goto power_off; + + for (i = 0; i < TC9563_MAX; i++) { + cfg = &ctx->cfg[i]; + ret = tc9563_pwrctrl_disable_port(ctx, i); + if (ret) { + dev_err(ctx->pwrctrl.dev, "Disabling port failed\n"); + goto power_off; + } + + ret = tc9563_pwrctrl_set_l0s_l1_entry_delay(ctx, i, false, cfg->l0s_delay); + if (ret) { + dev_err(ctx->pwrctrl.dev, "Setting L0s entry delay failed\n"); + goto power_off; + } + + ret = tc9563_pwrctrl_set_l0s_l1_entry_delay(ctx, i, true, cfg->l1_delay); + if (ret) { + dev_err(ctx->pwrctrl.dev, "Setting L1 entry delay failed\n"); + goto power_off; + } + + ret = tc9563_pwrctrl_set_tx_amplitude(ctx, i); + if (ret) { + dev_err(ctx->pwrctrl.dev, "Setting Tx amplitude failed\n"); + goto power_off; + } + + ret = tc9563_pwrctrl_set_nfts(ctx, i); + if (ret) { + dev_err(ctx->pwrctrl.dev, "Setting N_FTS failed\n"); + goto power_off; + } + + ret = tc9563_pwrctrl_disable_dfe(ctx, i); + if (ret) { + dev_err(ctx->pwrctrl.dev, "Disabling DFE failed\n"); + goto power_off; + } + } + + ret = tc9563_pwrctrl_assert_deassert_reset(ctx, true); + if (!ret) + return 0; + +power_off: + tc9563_pwrctrl_power_off(ctx); + return ret; +} + +static int tc9563_pwrctrl_probe(struct platform_device *pdev) +{ + struct pci_host_bridge *bridge = to_pci_host_bridge(pdev->dev.parent); + struct pci_bus *bus = bridge->bus; + struct device *dev = &pdev->dev; + enum tc9563_pwrctrl_ports port; + struct tc9563_pwrctrl_ctx *ctx; + struct device_node *i2c_node; + int ret, addr; + + ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ret = of_property_read_u32_index(pdev->dev.of_node, "i2c-parent", 1, &addr); + if (ret) + return dev_err_probe(dev, ret, "Failed to read i2c-parent property\n"); + + i2c_node = of_parse_phandle(dev->of_node, "i2c-parent", 0); + ctx->adapter = of_find_i2c_adapter_by_node(i2c_node); + of_node_put(i2c_node); + if (!ctx->adapter) + return dev_err_probe(dev, -EPROBE_DEFER, "Failed to find I2C adapter\n"); + + ctx->client = i2c_new_dummy_device(ctx->adapter, addr); + if (IS_ERR(ctx->client)) { + dev_err(dev, "Failed to create I2C client\n"); + i2c_put_adapter(ctx->adapter); + return PTR_ERR(ctx->client); + } + + for (int i = 0; i < ARRAY_SIZE(tc9563_supply_names); i++) + 
ctx->supplies[i].supply = tc9563_supply_names[i]; + + ret = devm_regulator_bulk_get(dev, TC9563_PWRCTL_MAX_SUPPLY, ctx->supplies); + if (ret) { + dev_err_probe(dev, ret, "failed to get supply regulator\n"); + goto remove_i2c; + } + + ctx->reset_gpio = devm_gpiod_get(dev, "resx", GPIOD_OUT_HIGH); + if (IS_ERR(ctx->reset_gpio)) { + ret = dev_err_probe(dev, PTR_ERR(ctx->reset_gpio), "failed to get resx GPIO\n"); + goto remove_i2c; + } + + pci_pwrctrl_init(&ctx->pwrctrl, dev); + + port = TC9563_USP; + ret = tc9563_pwrctrl_parse_device_dt(ctx, pdev->dev.of_node, port); + if (ret) { + dev_err(dev, "failed to parse device tree properties: %d\n", ret); + goto remove_i2c; + } + + /* + * Downstream ports are always children of the upstream port. + * The first node represents DSP1, the second node represents DSP2, and so on. + */ + for_each_child_of_node_scoped(pdev->dev.of_node, child) { + port++; + ret = tc9563_pwrctrl_parse_device_dt(ctx, child, port); + if (ret) + break; + /* Embedded ethernet device are under DSP3 */ + if (port == TC9563_DSP3) { + for_each_child_of_node_scoped(child, child1) { + port++; + ret = tc9563_pwrctrl_parse_device_dt(ctx, child1, port); + if (ret) + break; + } + } + } + if (ret) { + dev_err(dev, "failed to parse device tree properties: %d\n", ret); + goto remove_i2c; + } + + if (bridge->ops->assert_perst) { + ret = bridge->ops->assert_perst(bus, true); + if (ret) + goto remove_i2c; + } + + ret = tc9563_pwrctrl_bring_up(ctx); + if (ret) + goto remove_i2c; + + if (bridge->ops->assert_perst) { + ret = bridge->ops->assert_perst(bus, false); + if (ret) + goto power_off; + } + + ret = devm_pci_pwrctrl_device_set_ready(dev, &ctx->pwrctrl); + if (ret) + goto power_off; + + platform_set_drvdata(pdev, ctx); + + return 0; + +power_off: + tc9563_pwrctrl_power_off(ctx); +remove_i2c: + i2c_unregister_device(ctx->client); + i2c_put_adapter(ctx->adapter); + return ret; +} + +static void tc9563_pwrctrl_remove(struct platform_device *pdev) +{ + struct tc9563_pwrctrl_ctx *ctx = platform_get_drvdata(pdev); + + tc9563_pwrctrl_power_off(ctx); + i2c_unregister_device(ctx->client); + i2c_put_adapter(ctx->adapter); +} + +static const struct of_device_id tc9563_pwrctrl_of_match[] = { + { .compatible = "pci1179,0623"}, + { } +}; +MODULE_DEVICE_TABLE(of, tc9563_pwrctrl_of_match); + +static struct platform_driver tc9563_pwrctrl_driver = { + .driver = { + .name = "pwrctrl-tc9563", + .of_match_table = tc9563_pwrctrl_of_match, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, + .probe = tc9563_pwrctrl_probe, + .remove = tc9563_pwrctrl_remove, +}; +module_platform_driver(tc9563_pwrctrl_driver); + +MODULE_AUTHOR("Krishna chaitanya chundru <quic_krichai@quicinc.com>"); +MODULE_DESCRIPTION("TC956x power control driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c new file mode 100644 index 000000000000..ecdebdeb2dff --- /dev/null +++ b/drivers/pci/rebar.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCI Resizable BAR Extended Capability handling. 
+ */ + +#include <linux/bits.h> +#include <linux/bitfield.h> +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/export.h> +#include <linux/ioport.h> +#include <linux/log2.h> +#include <linux/pci.h> +#include <linux/sizes.h> +#include <linux/types.h> + +#include "pci.h" + +#define PCI_REBAR_MIN_SIZE ((resource_size_t)SZ_1M) + +/** + * pci_rebar_bytes_to_size - Convert size in bytes to PCI BAR Size + * @bytes: size in bytes + * + * Convert size in bytes to encoded BAR Size in Resizable BAR Capability + * (PCIe r6.2, sec. 7.8.6.3). + * + * Return: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB) + */ +int pci_rebar_bytes_to_size(u64 bytes) +{ + int rebar_minsize = ilog2(PCI_REBAR_MIN_SIZE); + + bytes = roundup_pow_of_two(bytes); + + return max(ilog2(bytes), rebar_minsize) - rebar_minsize; +} +EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size); + +/** + * pci_rebar_size_to_bytes - Convert encoded BAR Size to size in bytes + * @size: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB) + * + * Return: BAR size in bytes + */ +resource_size_t pci_rebar_size_to_bytes(int size) +{ + return 1ULL << (size + ilog2(PCI_REBAR_MIN_SIZE)); +} +EXPORT_SYMBOL_GPL(pci_rebar_size_to_bytes); + +void pci_rebar_init(struct pci_dev *pdev) +{ + pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR); +} + +/** + * pci_rebar_find_pos - find position of resize control reg for BAR + * @pdev: PCI device + * @bar: BAR to find + * + * Helper to find the position of the control register for a BAR. + * + * Return: + * * %-ENOTSUPP if resizable BARs are not supported at all, + * * %-ENOENT if no control register for the BAR could be found. + */ +static int pci_rebar_find_pos(struct pci_dev *pdev, int bar) +{ + unsigned int pos, nbars, i; + u32 ctrl; + + if (pci_resource_is_iov(bar)) { + pos = pci_iov_vf_rebar_cap(pdev); + bar = pci_resource_num_to_vf_bar(bar); + } else { + pos = pdev->rebar_cap; + } + + if (!pos) + return -ENOTSUPP; + + pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); + nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl); + + for (i = 0; i < nbars; i++, pos += 8) { + int bar_idx; + + pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); + bar_idx = FIELD_GET(PCI_REBAR_CTRL_BAR_IDX, ctrl); + if (bar_idx == bar) + return pos; + } + + return -ENOENT; +} + +/** + * pci_rebar_get_possible_sizes - get possible sizes for Resizable BAR + * @pdev: PCI device + * @bar: BAR to query + * + * Get the possible sizes of a resizable BAR as bitmask. + * + * Return: A bitmask of possible sizes (bit 0=1MB, bit 31=128TB), or %0 if + * BAR isn't resizable. + */ +u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) +{ + int pos; + u32 cap; + + pos = pci_rebar_find_pos(pdev, bar); + if (pos < 0) + return 0; + + pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap); + cap = FIELD_GET(PCI_REBAR_CAP_SIZES, cap); + + /* Sapphire RX 5600 XT Pulse has an invalid cap dword for BAR 0 */ + if (pdev->vendor == PCI_VENDOR_ID_ATI && pdev->device == 0x731f && + bar == 0 && cap == 0x700) + return 0x3f00; + + return cap; +} +EXPORT_SYMBOL(pci_rebar_get_possible_sizes); + +/** + * pci_rebar_size_supported - check if size is supported for BAR + * @pdev: PCI device + * @bar: BAR to check + * @size: encoded size as defined in the PCIe spec (0=1MB, 31=128TB) + * + * Return: %true if @bar is resizable and @size is supported, otherwise + * %false. 
+ */ +bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size) +{ + u64 sizes = pci_rebar_get_possible_sizes(pdev, bar); + + if (size < 0 || size > ilog2(SZ_128T) - ilog2(PCI_REBAR_MIN_SIZE)) + return false; + + return BIT(size) & sizes; +} +EXPORT_SYMBOL_GPL(pci_rebar_size_supported); + +/** + * pci_rebar_get_max_size - get the maximum supported size of a BAR + * @pdev: PCI device + * @bar: BAR to query + * + * Get the largest supported size of a resizable BAR as a size. + * + * Return: the encoded maximum BAR size as defined in the PCIe spec + * (0=1MB, 31=128TB), or %-NOENT on error. + */ +int pci_rebar_get_max_size(struct pci_dev *pdev, int bar) +{ + u64 sizes; + + sizes = pci_rebar_get_possible_sizes(pdev, bar); + if (!sizes) + return -ENOENT; + + return __fls(sizes); +} +EXPORT_SYMBOL_GPL(pci_rebar_get_max_size); + +/** + * pci_rebar_get_current_size - get the current size of a Resizable BAR + * @pdev: PCI device + * @bar: BAR to get the size from + * + * Read the current size of a BAR from the Resizable BAR config. + * + * Return: BAR Size if @bar is resizable (0=1MB, 31=128TB), or negative on + * error. + */ +int pci_rebar_get_current_size(struct pci_dev *pdev, int bar) +{ + int pos; + u32 ctrl; + + pos = pci_rebar_find_pos(pdev, bar); + if (pos < 0) + return pos; + + pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); + return FIELD_GET(PCI_REBAR_CTRL_BAR_SIZE, ctrl); +} + +/** + * pci_rebar_set_size - set a new size for a Resizable BAR + * @pdev: PCI device + * @bar: BAR to set size to + * @size: new size as defined in the PCIe spec (0=1MB, 31=128TB) + * + * Set the new size of a BAR as defined in the spec. + * + * Return: %0 if resizing was successful, or negative on error. + */ +int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size) +{ + int pos; + u32 ctrl; + + pos = pci_rebar_find_pos(pdev, bar); + if (pos < 0) + return pos; + + pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); + ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; + ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size); + pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); + + if (pci_resource_is_iov(bar)) + pci_iov_resource_set_size(pdev, bar, size); + + return 0; +} + +void pci_restore_rebar_state(struct pci_dev *pdev) +{ + unsigned int pos, nbars, i; + u32 ctrl; + + pos = pdev->rebar_cap; + if (!pos) + return; + + pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); + nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl); + + for (i = 0; i < nbars; i++, pos += 8) { + struct resource *res; + int bar_idx, size; + + pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); + bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX; + res = pci_resource_n(pdev, bar_idx); + size = pci_rebar_bytes_to_size(resource_size(res)); + ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; + ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size); + pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); + } +} + +static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev, + int resno) +{ + u16 cmd; + + if (pci_resource_is_iov(resno)) + return pci_iov_is_memory_decoding_enabled(dev); + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + + return cmd & PCI_COMMAND_MEMORY; +} + +void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size) +{ + resource_size_t res_size = pci_rebar_size_to_bytes(size); + struct resource *res = pci_resource_n(dev, resno); + + if (pci_resource_is_iov(resno)) + res_size *= pci_sriov_get_totalvfs(dev); + + resource_set_size(res, res_size); +} + +/** + * pci_resize_resource - reconfigure a 
Resizable BAR and resources + * @dev: the PCI device + * @resno: index of the BAR to be resized + * @size: new size as defined in the spec (0=1MB, 31=128TB) + * @exclude_bars: a mask of BARs that should not be released + * + * Reconfigure @resno to @size and re-run resource assignment algorithm + * with the new size. + * + * Prior to resize, release @dev resources that share a bridge window with + * @resno. This unpins the bridge window resource to allow changing it. + * + * The caller may prevent releasing a particular BAR by providing + * @exclude_bars mask, but this may result in the resize operation failing + * due to insufficient space. + * + * Return: 0 on success, or negative on error. In case of an error, the + * resources are restored to their original places. + */ +int pci_resize_resource(struct pci_dev *dev, int resno, int size, + int exclude_bars) +{ + struct pci_host_bridge *host; + int old, ret; + + /* Check if we must preserve the firmware's resource assignment */ + host = pci_find_host_bridge(dev->bus); + if (host->preserve_config) + return -ENOTSUPP; + + if (pci_resize_is_memory_decoding_enabled(dev, resno)) + return -EBUSY; + + if (!pci_rebar_size_supported(dev, resno, size)) + return -EINVAL; + + old = pci_rebar_get_current_size(dev, resno); + if (old < 0) + return old; + + ret = pci_rebar_set_size(dev, resno, size); + if (ret) + return ret; + + ret = pci_do_resource_release_and_resize(dev, resno, size, exclude_bars); + if (ret) + goto error_resize; + return 0; + +error_resize: + pci_rebar_set_size(dev, resno, old); + return ret; +} +EXPORT_SYMBOL(pci_resize_resource); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 3645f392a9fd..6e90f46f52af 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -15,6 +15,7 @@ */ #include <linux/bitops.h> +#include <linux/bug.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> @@ -135,6 +136,9 @@ static void restore_dev_resource(struct pci_dev_resource *dev_res) { struct resource *res = dev_res->res; + if (WARN_ON_ONCE(res->parent)) + return; + res->start = dev_res->start; res->end = dev_res->end; res->flags = dev_res->flags; @@ -2420,18 +2424,16 @@ EXPORT_SYMBOL_GPL(pci_assign_unassigned_bridge_resources); * release it when possible. If the bridge window contains assigned * resources, it cannot be released. 
*/ -int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) +static int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res, + struct list_head *saved) { unsigned long type = res->flags; struct pci_dev_resource *dev_res; - struct pci_dev *bridge; - LIST_HEAD(saved); + struct pci_dev *bridge = NULL; LIST_HEAD(added); LIST_HEAD(failed); unsigned int i; - int ret; - - down_read(&pci_bus_sem); + int ret = 0; while (!pci_is_root_bus(bus)) { bridge = bus->self; @@ -2443,9 +2445,9 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) /* Ignore BARs which are still in use */ if (!res->child) { - ret = add_to_list(&saved, bridge, res, 0, 0); + ret = add_to_list(saved, bridge, res, 0, 0); if (ret) - goto cleanup; + return ret; pci_release_resource(bridge, i); } else { @@ -2459,10 +2461,8 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) bus = bus->parent; } - if (list_empty(&saved)) { - up_read(&pci_bus_sem); + if (!bridge) return -ENOENT; - } __pci_bus_size_bridges(bridge->subordinate, &added); __pci_bridge_assign_resources(bridge, &added, &failed); @@ -2470,49 +2470,107 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) free_list(&added); if (!list_empty(&failed)) { - if (pci_required_resource_failed(&failed, type)) { + if (pci_required_resource_failed(&failed, type)) ret = -ENOSPC; - goto cleanup; - } - /* Only resources with unrelated types failed (again) */ free_list(&failed); + if (ret) + return ret; + + /* Only resources with unrelated types failed (again) */ } - list_for_each_entry(dev_res, &saved, list) { + list_for_each_entry(dev_res, saved, list) { + struct pci_dev *dev = dev_res->dev; + /* Skip the bridge we just assigned resources for */ - if (bridge == dev_res->dev) + if (bridge == dev) + continue; + + if (!dev->subordinate) continue; - bridge = dev_res->dev; - pci_setup_bridge(bridge->subordinate); + pci_setup_bridge(dev->subordinate); } - free_list(&saved); - up_read(&pci_bus_sem); return 0; +} -cleanup: - /* Restore size and flags */ - list_for_each_entry(dev_res, &failed, list) - restore_dev_resource(dev_res); - free_list(&failed); +int pci_do_resource_release_and_resize(struct pci_dev *pdev, int resno, int size, + int exclude_bars) +{ + struct resource *res = pci_resource_n(pdev, resno); + struct pci_dev_resource *dev_res; + struct pci_bus *bus = pdev->bus; + struct resource *b_win, *r; + LIST_HEAD(saved); + unsigned int i; + int ret = 0; + + b_win = pbus_select_window(bus, res); + if (!b_win) + return -EINVAL; + pci_dev_for_each_resource(pdev, r, i) { + if (i >= PCI_BRIDGE_RESOURCES) + break; + + if (exclude_bars & BIT(i)) + continue; + + if (b_win != pbus_select_window(bus, r)) + continue; + + ret = add_to_list(&saved, pdev, r, 0, 0); + if (ret) + goto restore; + pci_release_resource(pdev, i); + } + + pci_resize_resource_set_size(pdev, resno, size); + + if (!bus->self) + goto out; + + down_read(&pci_bus_sem); + ret = pbus_reassign_bridge_resources(bus, res, &saved); + if (ret) + goto restore; + +out: + up_read(&pci_bus_sem); + free_list(&saved); + return ret; + +restore: /* Revert to the old configuration */ list_for_each_entry(dev_res, &saved, list) { struct resource *res = dev_res->res; + struct pci_dev *dev = dev_res->dev; - bridge = dev_res->dev; - i = pci_resource_num(bridge, res); + i = pci_resource_num(dev, res); + + if (res->parent) { + release_child_resources(res); + pci_release_resource(dev, i); + } restore_dev_resource(dev_res); - 
pci_claim_resource(bridge, i); - pci_setup_bridge(bridge->subordinate); - } - free_list(&saved); - up_read(&pci_bus_sem); + ret = pci_claim_resource(dev, i); + if (ret) + continue; - return ret; + if (i < PCI_BRIDGE_RESOURCES) { + const char *res_name = pci_resource_name(dev, i); + + pci_update_resource(dev, i); + pci_info(dev, "%s %pR: old value restored\n", + res_name, res); + } + if (dev->subordinate) + pci_setup_bridge(dev->subordinate); + } + goto out; } void pci_assign_unassigned_bus_resources(struct pci_bus *bus) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index c3ba4ccecd43..e5fcadfc58b0 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -431,84 +431,6 @@ int pci_release_resource(struct pci_dev *dev, int resno) } EXPORT_SYMBOL(pci_release_resource); -static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev, - int resno) -{ - u16 cmd; - - if (pci_resource_is_iov(resno)) - return pci_iov_is_memory_decoding_enabled(dev); - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - - return cmd & PCI_COMMAND_MEMORY; -} - -static void pci_resize_resource_set_size(struct pci_dev *dev, int resno, - int size) -{ - resource_size_t res_size = pci_rebar_size_to_bytes(size); - struct resource *res = pci_resource_n(dev, resno); - - if (!pci_resource_is_iov(resno)) { - resource_set_size(res, res_size); - } else { - resource_set_size(res, res_size * pci_sriov_get_totalvfs(dev)); - pci_iov_resource_set_size(dev, resno, res_size); - } -} - -int pci_resize_resource(struct pci_dev *dev, int resno, int size) -{ - struct resource *res = pci_resource_n(dev, resno); - struct pci_host_bridge *host; - int old, ret; - u32 sizes; - - /* Check if we must preserve the firmware's resource assignment */ - host = pci_find_host_bridge(dev->bus); - if (host->preserve_config) - return -ENOTSUPP; - - /* Make sure the resource isn't assigned before resizing it. */ - if (!(res->flags & IORESOURCE_UNSET)) - return -EBUSY; - - if (pci_resize_is_memory_decoding_enabled(dev, resno)) - return -EBUSY; - - sizes = pci_rebar_get_possible_sizes(dev, resno); - if (!sizes) - return -ENOTSUPP; - - if (!(sizes & BIT(size))) - return -EINVAL; - - old = pci_rebar_get_current_size(dev, resno); - if (old < 0) - return old; - - ret = pci_rebar_set_size(dev, resno, size); - if (ret) - return ret; - - pci_resize_resource_set_size(dev, resno, size); - - /* Check if the new config works by trying to assign everything. */ - if (dev->bus->self) { - ret = pbus_reassign_bridge_resources(dev->bus, res); - if (ret) - goto error_resize; - } - return 0; - -error_resize: - pci_rebar_set_size(dev, resno, old); - pci_resize_resource_set_size(dev, resno, old); - return ret; -} -EXPORT_SYMBOL(pci_resize_resource); - int pci_enable_resources(struct pci_dev *dev, int mask) { u16 cmd, old_cmd; diff --git a/drivers/power/reset/Kconfig b/drivers/power/reset/Kconfig index 8248895ca903..f6c1bcbb57de 100644 --- a/drivers/power/reset/Kconfig +++ b/drivers/power/reset/Kconfig @@ -283,6 +283,15 @@ config POWER_RESET_KEYSTONE help Reboot support for the KEYSTONE SoCs. +config POWER_RESET_SPACEMIT_P1 + tristate "SpacemiT P1 poweroff and reset driver" + depends on ARCH_SPACEMIT || COMPILE_TEST + depends on MFD_SPACEMIT_P1 + default MFD_SPACEMIT_P1 + help + This driver supports power-off and reset operations for the SpacemiT + P1 PMIC. 
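The setup-res.c hunk above removes the old two-stage pci_resize_resource(); callers now go through the four-argument variant in pci.c, which releases the sibling BARs itself via pci_do_resource_release_and_resize(). A minimal caller sketch, assuming a hypothetical my_resize_bar() helper and an illustrative "keep the ROM BAR" policy — only pci_rebar_bytes_to_size(), pci_rebar_size_supported(), pci_rebar_get_max_size() and the new pci_resize_resource() signature are taken from the changes above:

#include <linux/bits.h>
#include <linux/pci.h>

static int my_resize_bar(struct pci_dev *pdev, int bar, resource_size_t wanted)
{
	/* Convert a byte count to the spec encoding (0 = 1 MB ... 31 = 128 TB). */
	int size = pci_rebar_bytes_to_size(wanted);

	/* Illustrative fallback: drop to the largest advertised size instead. */
	if (!pci_rebar_size_supported(pdev, bar, size))
		size = pci_rebar_get_max_size(pdev, bar);
	if (size < 0)
		return size;

	/*
	 * Resize @bar, but leave the ROM BAR pinned while the other resources
	 * sharing the bridge window are released and reassigned.
	 */
	return pci_resize_resource(pdev, bar, size, BIT(PCI_ROM_RESOURCE));
}

Note that pci_resize_resource() still returns -EBUSY while memory decoding is enabled on the device, so a real caller has to clear PCI_COMMAND_MEMORY around the resize.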
+ config POWER_RESET_SYSCON bool "Generic SYSCON regmap reset driver" depends on OF diff --git a/drivers/power/reset/Makefile b/drivers/power/reset/Makefile index 51da87e05ce7..0e4ae6f6b5c5 100644 --- a/drivers/power/reset/Makefile +++ b/drivers/power/reset/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_POWER_RESET_LTC2952) += ltc2952-poweroff.o obj-$(CONFIG_POWER_RESET_QNAP) += qnap-poweroff.o obj-$(CONFIG_POWER_RESET_REGULATOR) += regulator-poweroff.o obj-$(CONFIG_POWER_RESET_RESTART) += restart-poweroff.o +obj-$(CONFIG_POWER_RESET_SPACEMIT_P1) += spacemit-p1-reboot.o obj-$(CONFIG_POWER_RESET_ST) += st-poweroff.o obj-$(CONFIG_POWER_RESET_TH1520_AON) += th1520-aon-reboot.o obj-$(CONFIG_POWER_RESET_TORADEX_EC) += tdx-ec-poweroff.o diff --git a/drivers/power/reset/spacemit-p1-reboot.c b/drivers/power/reset/spacemit-p1-reboot.c new file mode 100644 index 000000000000..9ec3d1fff8f3 --- /dev/null +++ b/drivers/power/reset/spacemit-p1-reboot.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 by Aurelien Jarno + */ + +#include <linux/bits.h> +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/regmap.h> +#include <linux/reboot.h> + +/* Power Control Register 2 */ +#define PWR_CTRL2 0x7e +#define PWR_CTRL2_SHUTDOWN BIT(2) /* Shutdown request */ +#define PWR_CTRL2_RST BIT(1) /* Reset request */ + +static int spacemit_p1_pwroff_handler(struct sys_off_data *data) +{ + struct regmap *regmap = data->cb_data; + int ret; + + /* Put the PMIC into shutdown state */ + ret = regmap_set_bits(regmap, PWR_CTRL2, PWR_CTRL2_SHUTDOWN); + if (ret) { + dev_err(data->dev, "shutdown failed: %d\n", ret); + return notifier_from_errno(ret); + } + + return NOTIFY_DONE; +} + +static int spacemit_p1_restart_handler(struct sys_off_data *data) +{ + struct regmap *regmap = data->cb_data; + int ret; + + /* Put the PMIC into reset state */ + ret = regmap_set_bits(regmap, PWR_CTRL2, PWR_CTRL2_RST); + if (ret) { + dev_err(data->dev, "restart failed: %d\n", ret); + return notifier_from_errno(ret); + } + + return NOTIFY_DONE; +} + +static int spacemit_p1_reboot_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct regmap *regmap; + int ret; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return -ENODEV; + + ret = devm_register_power_off_handler(dev, &spacemit_p1_pwroff_handler, + regmap); + if (ret) + return dev_err_probe(dev, ret, + "Failed to register power off handler\n"); + + ret = devm_register_restart_handler(dev, spacemit_p1_restart_handler, + regmap); + if (ret) + return dev_err_probe(dev, ret, + "Failed to register restart handler\n"); + + return 0; +} + +static const struct platform_device_id spacemit_p1_reboot_id_table[] = { + { "spacemit-p1-reboot", }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(platform, spacemit_p1_reboot_id_table); + +static struct platform_driver spacemit_p1_reboot_driver = { + .driver = { + .name = "spacemit-p1-reboot", + }, + .probe = spacemit_p1_reboot_probe, + .id_table = spacemit_p1_reboot_id_table, +}; +module_platform_driver(spacemit_p1_reboot_driver); + +MODULE_DESCRIPTION("SpacemiT P1 reboot/poweroff driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig index 03c8525b480f..92f9f7aae92f 100644 --- a/drivers/power/supply/Kconfig +++ b/drivers/power/supply/Kconfig @@ -942,6 +942,21 @@ config CHARGER_RT9471 This driver can also be built as a module. If so, the module will be called rt9471. 
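The spacemit-p1-reboot driver above finds its regmap with dev_get_regmap(dev->parent, NULL) and matches purely by platform-device name, so the parent MFD driver only needs to own a regmap and register a child cell with that name. A hypothetical parent-side sketch under those assumptions (this is not the actual MFD_SPACEMIT_P1 driver; the 8-bit regmap config and single-cell layout are placeholders):

#include <linux/i2c.h>
#include <linux/mfd/core.h>
#include <linux/regmap.h>

/* Assumed 8-bit register/value width, consistent with the PWR_CTRL2 (0x7e) access above. */
static const struct regmap_config p1_regmap_config = {
	.reg_bits = 8,
	.val_bits = 8,
};

static const struct mfd_cell p1_cells[] = {
	{ .name = "spacemit-p1-reboot" },	/* matches spacemit_p1_reboot_id_table */
};

static int p1_mfd_probe(struct i2c_client *i2c)
{
	struct regmap *regmap;

	/* The reboot cell later looks this up via dev_get_regmap(dev->parent, NULL). */
	regmap = devm_regmap_init_i2c(i2c, &p1_regmap_config);
	if (IS_ERR(regmap))
		return PTR_ERR(regmap);

	return devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_NONE, p1_cells,
				    ARRAY_SIZE(p1_cells), NULL, 0, NULL);
}

The reboot driver itself then hooks into the generic sys-off machinery through devm_register_power_off_handler() and devm_register_restart_handler(), both at default priority, converting any regmap write failure into notifier_from_errno() as shown above.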
+config CHARGER_RT9756 + tristate "Richtek RT9756 smart cap divider charger driver" + depends on I2C + select REGMAP_I2C + select LINEAR_RANGES + help + This adds support for Richtek RT9756 smart cap divider charger driver. + It's a high efficiency and high charge current charger. the device + integrates smart cap divider topology with 9-channel high speed + ADCs that can provide input and output voltage, current and + temperature monitoring. + + This driver can also be built as a module. If so, the module will be + called rt9756. + config CHARGER_CROS_USBPD tristate "ChromeOS EC based USBPD charger" depends on CROS_USBPD_NOTIFY @@ -1007,6 +1022,15 @@ config CHARGER_UCS1002 Say Y to enable support for Microchip UCS1002 Programmable USB Port Power Controller with Charger Emulation. +config CHARGER_BD71828 + tristate "Power-supply driver for ROHM BD71828 and BD71815 PMIC" + depends on MFD_ROHM_BD71828 + help + Say Y here to enable support for charger and battery + in ROHM BD71815, BD71817, ROHM BD71828 power management + ICs. This driver gets various bits of information about battery + and charger states. + config CHARGER_BD99954 tristate "ROHM bd99954 charger driver" depends on I2C diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile index 6e37a3edf7e3..4b79d5abc49a 100644 --- a/drivers/power/supply/Makefile +++ b/drivers/power/supply/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_CHARGER_RT5033) += rt5033_charger.o obj-$(CONFIG_CHARGER_RT9455) += rt9455_charger.o obj-$(CONFIG_CHARGER_RT9467) += rt9467-charger.o obj-$(CONFIG_CHARGER_RT9471) += rt9471.o +obj-$(CONFIG_CHARGER_RT9756) += rt9756.o obj-$(CONFIG_BATTERY_TWL4030_MADC) += twl4030_madc_battery.o obj-$(CONFIG_CHARGER_88PM860X) += 88pm860x_charger.o obj-$(CONFIG_CHARGER_PF1550) += pf1550-charger.o @@ -117,6 +118,7 @@ obj-$(CONFIG_CHARGER_SC2731) += sc2731_charger.o obj-$(CONFIG_FUEL_GAUGE_SC27XX) += sc27xx_fuel_gauge.o obj-$(CONFIG_FUEL_GAUGE_STC3117) += stc3117_fuel_gauge.o obj-$(CONFIG_CHARGER_UCS1002) += ucs1002_power.o +obj-$(CONFIG_CHARGER_BD71828) += bd71828-power.o obj-$(CONFIG_CHARGER_BD99954) += bd99954-charger.o obj-$(CONFIG_CHARGER_WILCO) += wilco-charger.o obj-$(CONFIG_RN5T618_POWER) += rn5t618_power.o diff --git a/drivers/power/supply/apm_power.c b/drivers/power/supply/apm_power.c index 9236e0078578..9933cdc5c387 100644 --- a/drivers/power/supply/apm_power.c +++ b/drivers/power/supply/apm_power.c @@ -364,7 +364,8 @@ static int __init apm_battery_init(void) static void __exit apm_battery_exit(void) { - apm_get_power_status = NULL; + if (apm_get_power_status == apm_battery_apm_get_power_status) + apm_get_power_status = NULL; } module_init(apm_battery_init); diff --git a/drivers/power/supply/bd71828-power.c b/drivers/power/supply/bd71828-power.c new file mode 100644 index 000000000000..f667baedeb77 --- /dev/null +++ b/drivers/power/supply/bd71828-power.c @@ -0,0 +1,1049 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* ROHM BD71815, BD71828 and BD71878 Charger driver */ + +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/mfd/rohm-bd71815.h> +#include <linux/mfd/rohm-bd71828.h> +#include <linux/module.h> +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/property.h> +#include <linux/power_supply.h> +#include <linux/slab.h> + +/* common defines */ +#define BD7182x_MASK_VBAT_U 0x1f +#define BD7182x_MASK_VDCIN_U 0x0f +#define BD7182x_MASK_IBAT_U 0x3f +#define BD7182x_MASK_CURDIR_DISCHG 0x80 +#define BD7182x_MASK_CHG_STATE 0x7f +#define 
BD7182x_MASK_BAT_TEMP 0x07 +#define BD7182x_MASK_DCIN_DET BIT(0) +#define BD7182x_MASK_CONF_PON BIT(0) +#define BD71815_MASK_CONF_XSTB BIT(1) +#define BD7182x_MASK_BAT_STAT 0x3f +#define BD7182x_MASK_DCIN_STAT 0x07 + +#define BD7182x_MASK_WDT_AUTO 0x40 +#define BD7182x_MASK_VBAT_ALM_LIMIT_U 0x01 +#define BD7182x_MASK_CHG_EN 0x01 + +#define BD7182x_DCIN_COLLAPSE_DEFAULT 0x36 + +#define MAX_CURRENT_DEFAULT 890000 /* uA */ +#define AC_NAME "bd71828_ac" +#define BAT_NAME "bd71828_bat" + +#define BAT_OPEN 0x7 + +/* + * VBAT Low voltage detection Threshold + * 0x00D4*16mV = 212*0.016 = 3.392v + */ +#define VBAT_LOW_TH 0x00D4 + +struct pwr_regs { + u8 vbat_avg; + u8 ibat; + u8 ibat_avg; + u8 btemp_vth; + u8 chg_state; + u8 bat_temp; + u8 dcin_stat; + u8 dcin_collapse_limit; + u8 chg_set1; + u8 chg_en; + u8 vbat_alm_limit_u; + u8 conf; + u8 vdcin; +}; + +static const struct pwr_regs pwr_regs_bd71828 = { + .vbat_avg = BD71828_REG_VBAT_U, + .ibat = BD71828_REG_IBAT_U, + .ibat_avg = BD71828_REG_IBAT_AVG_U, + .btemp_vth = BD71828_REG_VM_BTMP_U, + .chg_state = BD71828_REG_CHG_STATE, + .bat_temp = BD71828_REG_BAT_TEMP, + .dcin_stat = BD71828_REG_DCIN_STAT, + .dcin_collapse_limit = BD71828_REG_DCIN_CLPS, + .chg_set1 = BD71828_REG_CHG_SET1, + .chg_en = BD71828_REG_CHG_EN, + .vbat_alm_limit_u = BD71828_REG_ALM_VBAT_LIMIT_U, + .conf = BD71828_REG_CONF, + .vdcin = BD71828_REG_VDCIN_U, +}; + +static const struct pwr_regs pwr_regs_bd71815 = { + .vbat_avg = BD71815_REG_VM_SA_VBAT_U, + /* BD71815 does not have separate current and current avg */ + .ibat = BD71815_REG_CC_CURCD_U, + .ibat_avg = BD71815_REG_CC_CURCD_U, + + .btemp_vth = BD71815_REG_VM_BTMP, + .chg_state = BD71815_REG_CHG_STATE, + .bat_temp = BD71815_REG_BAT_TEMP, + .dcin_stat = BD71815_REG_DCIN_STAT, + .dcin_collapse_limit = BD71815_REG_DCIN_CLPS, + .chg_set1 = BD71815_REG_CHG_SET1, + .chg_en = BD71815_REG_CHG_SET1, + .vbat_alm_limit_u = BD71815_REG_ALM_VBAT_TH_U, + .conf = BD71815_REG_CONF, + + .vdcin = BD71815_REG_VM_DCIN_U, +}; + +struct bd71828_power { + struct regmap *regmap; + enum rohm_chip_type chip_type; + struct device *dev; + struct power_supply *ac; + struct power_supply *bat; + + const struct pwr_regs *regs; + /* Reg val to uA */ + int curr_factor; + int rsens; + int (*get_temp)(struct bd71828_power *pwr, int *temp); + int (*bat_inserted)(struct bd71828_power *pwr); +}; + +static int bd7182x_write16(struct bd71828_power *pwr, int reg, u16 val) +{ + __be16 tmp; + + tmp = cpu_to_be16(val); + + return regmap_bulk_write(pwr->regmap, reg, &tmp, sizeof(tmp)); +} + +static int bd7182x_read16_himask(struct bd71828_power *pwr, int reg, int himask, + u16 *val) +{ + struct regmap *regmap = pwr->regmap; + int ret; + __be16 rvals; + u8 *tmp = (u8 *)&rvals; + + ret = regmap_bulk_read(regmap, reg, &rvals, sizeof(*val)); + if (!ret) { + *tmp &= himask; + *val = be16_to_cpu(rvals); + } + + return ret; +} + +static int bd71828_get_vbat(struct bd71828_power *pwr, int *vcell) +{ + u16 tmp_vcell; + int ret; + + ret = bd7182x_read16_himask(pwr, pwr->regs->vbat_avg, + BD7182x_MASK_VBAT_U, &tmp_vcell); + if (ret) + dev_err(pwr->dev, "Failed to read battery average voltage\n"); + else + *vcell = ((int)tmp_vcell) * 1000; + + return ret; +} + +static int bd71828_get_current_ds_adc(struct bd71828_power *pwr, int *curr, int *curr_avg) +{ + __be16 tmp_curr; + char *tmp = (char *)&tmp_curr; + int dir = 1; + int regs[] = { pwr->regs->ibat, pwr->regs->ibat_avg }; + int *vals[] = { curr, curr_avg }; + int ret, i; + + for (dir = 1, i = 0; i < ARRAY_SIZE(regs); i++) { + 
ret = regmap_bulk_read(pwr->regmap, regs[i], &tmp_curr, + sizeof(tmp_curr)); + if (ret) + break; + + if (*tmp & BD7182x_MASK_CURDIR_DISCHG) + dir = -1; + + *tmp &= BD7182x_MASK_IBAT_U; + + *vals[i] = dir * ((int)be16_to_cpu(tmp_curr)) * pwr->curr_factor; + } + + return ret; +} + +/* Unit is tenths of degree C */ +static int bd71815_get_temp(struct bd71828_power *pwr, int *temp) +{ + struct regmap *regmap = pwr->regmap; + int ret; + int t; + + ret = regmap_read(regmap, pwr->regs->btemp_vth, &t); + if (ret) + return ret; + + t = 200 - t; + + if (t > 200) { + dev_err(pwr->dev, "Failed to read battery temperature\n"); + return -ENODATA; + } + + return 0; +} + +/* Unit is tenths of degree C */ +static int bd71828_get_temp(struct bd71828_power *pwr, int *temp) +{ + u16 t; + int ret; + int tmp = 200 * 10000; + + ret = bd7182x_read16_himask(pwr, pwr->regs->btemp_vth, + BD71828_MASK_VM_BTMP_U, &t); + if (ret) + return ret; + + if (t > 3200) { + dev_err(pwr->dev, + "Failed to read battery temperature\n"); + return -ENODATA; + } + + tmp -= 625ULL * (unsigned int)t; + *temp = tmp / 1000; + + return ret; +} + +static int bd71828_charge_status(struct bd71828_power *pwr, + int *s, int *h) +{ + unsigned int state; + int status, health; + int ret = 1; + + ret = regmap_read(pwr->regmap, pwr->regs->chg_state, &state); + if (ret) { + dev_err(pwr->dev, "charger status reading failed (%d)\n", ret); + return ret; + } + + state &= BD7182x_MASK_CHG_STATE; + + dev_dbg(pwr->dev, "CHG_STATE %d\n", state); + + switch (state) { + case 0x00: + status = POWER_SUPPLY_STATUS_DISCHARGING; + health = POWER_SUPPLY_HEALTH_GOOD; + break; + case 0x01: + case 0x02: + case 0x03: + case 0x0E: + status = POWER_SUPPLY_STATUS_CHARGING; + health = POWER_SUPPLY_HEALTH_GOOD; + break; + case 0x0F: + status = POWER_SUPPLY_STATUS_FULL; + health = POWER_SUPPLY_HEALTH_GOOD; + break; + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + status = POWER_SUPPLY_STATUS_NOT_CHARGING; + health = POWER_SUPPLY_HEALTH_OVERHEAT; + break; + case 0x30: + case 0x31: + case 0x32: + case 0x40: + status = POWER_SUPPLY_STATUS_DISCHARGING; + health = POWER_SUPPLY_HEALTH_GOOD; + break; + case 0x7f: + default: + status = POWER_SUPPLY_STATUS_NOT_CHARGING; + health = POWER_SUPPLY_HEALTH_DEAD; + break; + } + + if (s) + *s = status; + if (h) + *h = health; + + return ret; +} + +static int get_chg_online(struct bd71828_power *pwr, int *chg_online) +{ + int r, ret; + + ret = regmap_read(pwr->regmap, pwr->regs->dcin_stat, &r); + if (ret) { + dev_err(pwr->dev, "Failed to read DCIN status\n"); + return ret; + } + *chg_online = ((r & BD7182x_MASK_DCIN_DET) != 0); + + return 0; +} + +static int get_bat_online(struct bd71828_power *pwr, int *bat_online) +{ + int r, ret; + + ret = regmap_read(pwr->regmap, pwr->regs->bat_temp, &r); + if (ret) { + dev_err(pwr->dev, "Failed to read battery temperature\n"); + return ret; + } + *bat_online = ((r & BD7182x_MASK_BAT_TEMP) != BAT_OPEN); + + return 0; +} + +static int bd71828_bat_inserted(struct bd71828_power *pwr) +{ + int ret, val; + + ret = regmap_read(pwr->regmap, pwr->regs->conf, &val); + if (ret) { + dev_err(pwr->dev, "Failed to read CONF register\n"); + return 0; + } + ret = val & BD7182x_MASK_CONF_PON; + + if (ret) + regmap_update_bits(pwr->regmap, pwr->regs->conf, + BD7182x_MASK_CONF_PON, 0); + + return ret; +} + +static int bd71815_bat_inserted(struct bd71828_power *pwr) +{ + int ret, val; + + ret = regmap_read(pwr->regmap, pwr->regs->conf, &val); 
+ if (ret) { + dev_err(pwr->dev, "Failed to read CONF register\n"); + return ret; + } + + ret = !(val & BD71815_MASK_CONF_XSTB); + if (ret) + regmap_write(pwr->regmap, pwr->regs->conf, val | + BD71815_MASK_CONF_XSTB); + + return ret; +} + +static int bd71828_init_hardware(struct bd71828_power *pwr) +{ + int ret; + + /* TODO: Collapse limit should come from device-tree ? */ + ret = regmap_write(pwr->regmap, pwr->regs->dcin_collapse_limit, + BD7182x_DCIN_COLLAPSE_DEFAULT); + if (ret) { + dev_err(pwr->dev, "Failed to write DCIN collapse limit\n"); + return ret; + } + + ret = pwr->bat_inserted(pwr); + if (ret < 0) + return ret; + + if (ret) { + /* WDT_FST auto set */ + ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_set1, + BD7182x_MASK_WDT_AUTO, + BD7182x_MASK_WDT_AUTO); + if (ret) + return ret; + + ret = bd7182x_write16(pwr, pwr->regs->vbat_alm_limit_u, + VBAT_LOW_TH); + if (ret) + return ret; + + /* + * On BD71815 "we mask the power-state" from relax detection. + * I am unsure what the impact of the power-state would be if + * we didn't - but this is what the vendor driver did - and + * that driver has been used in few projects so I just assume + * this is needed. + */ + if (pwr->chip_type == ROHM_CHIP_TYPE_BD71815) { + ret = regmap_set_bits(pwr->regmap, + BD71815_REG_REX_CTRL_1, + REX_PMU_STATE_MASK); + if (ret) + return ret; + } + } + + return 0; +} + +static int bd71828_charger_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent); + u32 vot; + u16 tmp; + int online; + int ret; + + switch (psp) { + case POWER_SUPPLY_PROP_ONLINE: + ret = get_chg_online(pwr, &online); + if (!ret) + val->intval = online; + break; + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + ret = bd7182x_read16_himask(pwr, pwr->regs->vdcin, + BD7182x_MASK_VDCIN_U, &tmp); + if (ret) + return ret; + + vot = tmp; + /* 5 milli volt steps */ + val->intval = 5000 * vot; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int bd71828_battery_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent); + int ret = 0; + int status, health, tmp, curr, curr_avg, chg_en; + + if (psp == POWER_SUPPLY_PROP_STATUS || + psp == POWER_SUPPLY_PROP_HEALTH || + psp == POWER_SUPPLY_PROP_CHARGE_TYPE) + ret = bd71828_charge_status(pwr, &status, &health); + else if (psp == POWER_SUPPLY_PROP_CURRENT_AVG || + psp == POWER_SUPPLY_PROP_CURRENT_NOW) + ret = bd71828_get_current_ds_adc(pwr, &curr, &curr_avg); + if (ret) + return ret; + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + val->intval = status; + break; + case POWER_SUPPLY_PROP_HEALTH: + val->intval = health; + break; + case POWER_SUPPLY_PROP_PRESENT: + ret = get_bat_online(pwr, &tmp); + if (!ret) + val->intval = tmp; + break; + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + ret = bd71828_get_vbat(pwr, &tmp); + val->intval = tmp; + break; + case POWER_SUPPLY_PROP_TECHNOLOGY: + val->intval = POWER_SUPPLY_TECHNOLOGY_LION; + break; + case POWER_SUPPLY_PROP_CURRENT_AVG: + val->intval = curr_avg; + break; + case POWER_SUPPLY_PROP_CURRENT_NOW: + val->intval = curr; + break; + case POWER_SUPPLY_PROP_CURRENT_MAX: + val->intval = MAX_CURRENT_DEFAULT; + break; + case POWER_SUPPLY_PROP_TEMP: + ret = pwr->get_temp(pwr, &val->intval); + break; + case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR: + ret = regmap_read(pwr->regmap, pwr->regs->chg_en, &chg_en); + if (ret) + return 
ret; + + val->intval = (chg_en & BD7182x_MASK_CHG_EN) ? + POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO : + POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int bd71828_battery_set_property(struct power_supply *psy, + enum power_supply_property psp, + const union power_supply_propval *val) +{ + struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent); + int ret = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR: + if (val->intval == POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO) + ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_en, + BD7182x_MASK_CHG_EN, + BD7182x_MASK_CHG_EN); + else + ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_en, + BD7182x_MASK_CHG_EN, + 0); + break; + default: + return -EINVAL; + } + + return ret; +} + +static int bd71828_battery_property_is_writeable(struct power_supply *psy, + enum power_supply_property psp) +{ + switch (psp) { + case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR: + return true; + default: + return false; + } +} + +/** @brief ac properties */ +static const enum power_supply_property bd71828_charger_props[] = { + POWER_SUPPLY_PROP_ONLINE, + POWER_SUPPLY_PROP_VOLTAGE_NOW, +}; + +static const enum power_supply_property bd71828_battery_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_HEALTH, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_HEALTH, + POWER_SUPPLY_PROP_PRESENT, + POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_TEMP, + POWER_SUPPLY_PROP_CURRENT_AVG, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_CURRENT_MAX, + POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR, +}; + +/** @brief powers supplied by bd71828_ac */ +static char *bd71828_ac_supplied_to[] = { + BAT_NAME, +}; + +static const struct power_supply_desc bd71828_ac_desc = { + .name = AC_NAME, + .type = POWER_SUPPLY_TYPE_MAINS, + .properties = bd71828_charger_props, + .num_properties = ARRAY_SIZE(bd71828_charger_props), + .get_property = bd71828_charger_get_property, +}; + +static const struct power_supply_desc bd71828_bat_desc = { + .name = BAT_NAME, + .type = POWER_SUPPLY_TYPE_BATTERY, + .charge_behaviours = BIT(POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO) | + BIT(POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE), + .properties = bd71828_battery_props, + .num_properties = ARRAY_SIZE(bd71828_battery_props), + .get_property = bd71828_battery_get_property, + .set_property = bd71828_battery_set_property, + .property_is_writeable = bd71828_battery_property_is_writeable, +}; + +#define RSENS_CURR 10000000LLU + +#define BD_ISR_NAME(name) \ +bd7181x_##name##_isr + +#define BD_ISR_BAT(name, print, run_gauge) \ +static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \ +{ \ + struct bd71828_power *pwr = (struct bd71828_power *)data; \ + \ + dev_dbg(pwr->dev, "%s\n", print); \ + power_supply_changed(pwr->bat); \ + \ + return IRQ_HANDLED; \ +} + +#define BD_ISR_AC(name, print, run_gauge) \ +static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \ +{ \ + struct bd71828_power *pwr = (struct bd71828_power *)data; \ + \ + power_supply_changed(pwr->ac); \ + dev_dbg(pwr->dev, "%s\n", print); \ + power_supply_changed(pwr->bat); \ + \ + return IRQ_HANDLED; \ +} + +#define BD_ISR_DUMMY(name, print) \ +static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \ +{ \ + struct bd71828_power *pwr = (struct bd71828_power *)data; \ + \ + dev_dbg(pwr->dev, "%s\n", print); \ + \ + return IRQ_HANDLED; \ +} + +BD_ISR_BAT(chg_state_changed, "CHG state changed", true) +/* DCIN voltage changes */ +BD_ISR_AC(dcin_removed, "DCIN removed", true) 
+BD_ISR_AC(clps_out, "DCIN voltage back to normal", true) +BD_ISR_AC(clps_in, "DCIN voltage collapsed", false) +BD_ISR_AC(dcin_ovp_res, "DCIN voltage normal", true) +BD_ISR_AC(dcin_ovp_det, "DCIN OVER VOLTAGE", true) + +BD_ISR_DUMMY(dcin_mon_det, "DCIN voltage below threshold") +BD_ISR_DUMMY(dcin_mon_res, "DCIN voltage above threshold") + +BD_ISR_DUMMY(vsys_uv_res, "VSYS under-voltage cleared") +BD_ISR_DUMMY(vsys_uv_det, "VSYS under-voltage") +BD_ISR_DUMMY(vsys_low_res, "'VSYS low' cleared") +BD_ISR_DUMMY(vsys_low_det, "VSYS low") +BD_ISR_DUMMY(vsys_mon_res, "VSYS mon - resumed") +BD_ISR_DUMMY(vsys_mon_det, "VSYS mon - detected") +BD_ISR_BAT(chg_wdg_temp, "charger temperature watchdog triggered", true) +BD_ISR_BAT(chg_wdg, "charging watchdog triggered", true) +BD_ISR_BAT(bat_removed, "Battery removed", true) +BD_ISR_BAT(bat_det, "Battery detected", true) +/* TODO: Verify the meaning of these interrupts */ +BD_ISR_BAT(rechg_det, "Recharging", true) +BD_ISR_BAT(rechg_res, "Recharge ending", true) +BD_ISR_DUMMY(temp_transit, "Temperature transition") +BD_ISR_BAT(therm_rmv, "bd71815-therm-rmv", false) +BD_ISR_BAT(therm_det, "bd71815-therm-det", true) +BD_ISR_BAT(bat_dead, "bd71815-bat-dead", false) +BD_ISR_BAT(bat_short_res, "bd71815-bat-short-res", true) +BD_ISR_BAT(bat_short, "bd71815-bat-short-det", false) +BD_ISR_BAT(bat_low_res, "bd71815-bat-low-res", true) +BD_ISR_BAT(bat_low, "bd71815-bat-low-det", true) +BD_ISR_BAT(bat_ov_res, "bd71815-bat-over-res", true) +/* What should we do here? */ +BD_ISR_BAT(bat_ov, "bd71815-bat-over-det", false) +BD_ISR_BAT(bat_mon_res, "bd71815-bat-mon-res", true) +BD_ISR_BAT(bat_mon, "bd71815-bat-mon-det", true) +BD_ISR_BAT(bat_cc_mon, "bd71815-bat-cc-mon2", false) +BD_ISR_BAT(bat_oc1_res, "bd71815-bat-oc1-res", true) +BD_ISR_BAT(bat_oc1, "bd71815-bat-oc1-det", false) +BD_ISR_BAT(bat_oc2_res, "bd71815-bat-oc2-res", true) +BD_ISR_BAT(bat_oc2, "bd71815-bat-oc2-det", false) +BD_ISR_BAT(bat_oc3_res, "bd71815-bat-oc3-res", true) +BD_ISR_BAT(bat_oc3, "bd71815-bat-oc3-det", false) +BD_ISR_BAT(temp_bat_low_res, "bd71815-temp-bat-low-res", true) +BD_ISR_BAT(temp_bat_low, "bd71815-temp-bat-low-det", true) +BD_ISR_BAT(temp_bat_hi_res, "bd71815-temp-bat-hi-res", true) +BD_ISR_BAT(temp_bat_hi, "bd71815-temp-bat-hi-det", true) + +static irqreturn_t bd7182x_dcin_removed(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + power_supply_changed(pwr->ac); + dev_dbg(pwr->dev, "DCIN removed\n"); + + return IRQ_HANDLED; +} + +static irqreturn_t bd718x7_chg_done(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + power_supply_changed(pwr->bat); + + return IRQ_HANDLED; +} + +static irqreturn_t bd7182x_dcin_detected(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "DCIN inserted\n"); + power_supply_changed(pwr->ac); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_vbat_low_res(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "VBAT LOW Resumed\n"); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_vbat_low_det(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "VBAT LOW Detected\n"); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_temp_bat_hi_det(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_warn(pwr->dev, "Overtemp Detected\n"); + power_supply_changed(pwr->bat); + + return 
IRQ_HANDLED; +} + +static irqreturn_t bd71828_temp_bat_hi_res(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "Overtemp Resumed\n"); + power_supply_changed(pwr->bat); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_temp_bat_low_det(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "Lowtemp Detected\n"); + power_supply_changed(pwr->bat); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_temp_bat_low_res(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "Lowtemp Resumed\n"); + power_supply_changed(pwr->bat); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_temp_vf_det(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "VF Detected\n"); + power_supply_changed(pwr->bat); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_temp_vf_res(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "VF Resumed\n"); + power_supply_changed(pwr->bat); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_temp_vf125_det(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "VF125 Detected\n"); + power_supply_changed(pwr->bat); + + return IRQ_HANDLED; +} + +static irqreturn_t bd71828_temp_vf125_res(int irq, void *data) +{ + struct bd71828_power *pwr = (struct bd71828_power *)data; + + dev_dbg(pwr->dev, "VF125 Resumed\n"); + power_supply_changed(pwr->bat); + + return IRQ_HANDLED; +} + +struct bd7182x_irq_res { + const char *name; + irq_handler_t handler; +}; + +#define BDIRQ(na, hn) { .name = (na), .handler = (hn) } + +static int bd7182x_get_irqs(struct platform_device *pdev, + struct bd71828_power *pwr) +{ + int i, irq, ret; + static const struct bd7182x_irq_res bd71815_irqs[] = { + BDIRQ("bd71815-dcin-rmv", BD_ISR_NAME(dcin_removed)), + BDIRQ("bd71815-dcin-clps-out", BD_ISR_NAME(clps_out)), + BDIRQ("bd71815-dcin-clps-in", BD_ISR_NAME(clps_in)), + BDIRQ("bd71815-dcin-ovp-res", BD_ISR_NAME(dcin_ovp_res)), + BDIRQ("bd71815-dcin-ovp-det", BD_ISR_NAME(dcin_ovp_det)), + BDIRQ("bd71815-dcin-mon-res", BD_ISR_NAME(dcin_mon_res)), + BDIRQ("bd71815-dcin-mon-det", BD_ISR_NAME(dcin_mon_det)), + + BDIRQ("bd71815-vsys-uv-res", BD_ISR_NAME(vsys_uv_res)), + BDIRQ("bd71815-vsys-uv-det", BD_ISR_NAME(vsys_uv_det)), + BDIRQ("bd71815-vsys-low-res", BD_ISR_NAME(vsys_low_res)), + BDIRQ("bd71815-vsys-low-det", BD_ISR_NAME(vsys_low_det)), + BDIRQ("bd71815-vsys-mon-res", BD_ISR_NAME(vsys_mon_res)), + BDIRQ("bd71815-vsys-mon-det", BD_ISR_NAME(vsys_mon_det)), + BDIRQ("bd71815-chg-wdg-temp", BD_ISR_NAME(chg_wdg_temp)), + BDIRQ("bd71815-chg-wdg", BD_ISR_NAME(chg_wdg)), + BDIRQ("bd71815-rechg-det", BD_ISR_NAME(rechg_det)), + BDIRQ("bd71815-rechg-res", BD_ISR_NAME(rechg_res)), + BDIRQ("bd71815-ranged-temp-transit", BD_ISR_NAME(temp_transit)), + BDIRQ("bd71815-chg-state-change", BD_ISR_NAME(chg_state_changed)), + BDIRQ("bd71815-bat-temp-normal", bd71828_temp_bat_hi_res), + BDIRQ("bd71815-bat-temp-erange", bd71828_temp_bat_hi_det), + BDIRQ("bd71815-bat-rmv", BD_ISR_NAME(bat_removed)), + BDIRQ("bd71815-bat-det", BD_ISR_NAME(bat_det)), + + /* Add ISRs for these */ + BDIRQ("bd71815-therm-rmv", BD_ISR_NAME(therm_rmv)), + BDIRQ("bd71815-therm-det", BD_ISR_NAME(therm_det)), + BDIRQ("bd71815-bat-dead", BD_ISR_NAME(bat_dead)), + BDIRQ("bd71815-bat-short-res", BD_ISR_NAME(bat_short_res)), + 
BDIRQ("bd71815-bat-short-det", BD_ISR_NAME(bat_short)), + BDIRQ("bd71815-bat-low-res", BD_ISR_NAME(bat_low_res)), + BDIRQ("bd71815-bat-low-det", BD_ISR_NAME(bat_low)), + BDIRQ("bd71815-bat-over-res", BD_ISR_NAME(bat_ov_res)), + BDIRQ("bd71815-bat-over-det", BD_ISR_NAME(bat_ov)), + BDIRQ("bd71815-bat-mon-res", BD_ISR_NAME(bat_mon_res)), + BDIRQ("bd71815-bat-mon-det", BD_ISR_NAME(bat_mon)), + /* cc-mon 1 & 3 ? */ + BDIRQ("bd71815-bat-cc-mon2", BD_ISR_NAME(bat_cc_mon)), + BDIRQ("bd71815-bat-oc1-res", BD_ISR_NAME(bat_oc1_res)), + BDIRQ("bd71815-bat-oc1-det", BD_ISR_NAME(bat_oc1)), + BDIRQ("bd71815-bat-oc2-res", BD_ISR_NAME(bat_oc2_res)), + BDIRQ("bd71815-bat-oc2-det", BD_ISR_NAME(bat_oc2)), + BDIRQ("bd71815-bat-oc3-res", BD_ISR_NAME(bat_oc3_res)), + BDIRQ("bd71815-bat-oc3-det", BD_ISR_NAME(bat_oc3)), + BDIRQ("bd71815-temp-bat-low-res", BD_ISR_NAME(temp_bat_low_res)), + BDIRQ("bd71815-temp-bat-low-det", BD_ISR_NAME(temp_bat_low)), + BDIRQ("bd71815-temp-bat-hi-res", BD_ISR_NAME(temp_bat_hi_res)), + BDIRQ("bd71815-temp-bat-hi-det", BD_ISR_NAME(temp_bat_hi)), + /* + * TODO: add rest of the IRQs and re-check the handling. + * Check the bd71815-bat-cc-mon1, bd71815-bat-cc-mon3, + * bd71815-bat-low-res, bd71815-bat-low-det, + * bd71815-bat-hi-res, bd71815-bat-hi-det. + */ + }; + static const struct bd7182x_irq_res bd71828_irqs[] = { + BDIRQ("bd71828-chg-done", bd718x7_chg_done), + BDIRQ("bd71828-pwr-dcin-in", bd7182x_dcin_detected), + BDIRQ("bd71828-pwr-dcin-out", bd7182x_dcin_removed), + BDIRQ("bd71828-vbat-normal", bd71828_vbat_low_res), + BDIRQ("bd71828-vbat-low", bd71828_vbat_low_det), + BDIRQ("bd71828-btemp-hi", bd71828_temp_bat_hi_det), + BDIRQ("bd71828-btemp-cool", bd71828_temp_bat_hi_res), + BDIRQ("bd71828-btemp-lo", bd71828_temp_bat_low_det), + BDIRQ("bd71828-btemp-warm", bd71828_temp_bat_low_res), + BDIRQ("bd71828-temp-hi", bd71828_temp_vf_det), + BDIRQ("bd71828-temp-norm", bd71828_temp_vf_res), + BDIRQ("bd71828-temp-125-over", bd71828_temp_vf125_det), + BDIRQ("bd71828-temp-125-under", bd71828_temp_vf125_res), + }; + int num_irqs; + const struct bd7182x_irq_res *irqs; + + switch (pwr->chip_type) { + case ROHM_CHIP_TYPE_BD71828: + irqs = &bd71828_irqs[0]; + num_irqs = ARRAY_SIZE(bd71828_irqs); + break; + case ROHM_CHIP_TYPE_BD71815: + irqs = &bd71815_irqs[0]; + num_irqs = ARRAY_SIZE(bd71815_irqs); + break; + default: + return -EINVAL; + } + + for (i = 0; i < num_irqs; i++) { + irq = platform_get_irq_byname(pdev, irqs[i].name); + + ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, + irqs[i].handler, 0, + irqs[i].name, pwr); + if (ret) + break; + } + + return ret; +} + +#define RSENS_DEFAULT_30MOHM 30000 /* 30 mOhm in uOhms*/ + +static int bd7182x_get_rsens(struct bd71828_power *pwr) +{ + u64 tmp = RSENS_CURR; + int rsens_ohm = RSENS_DEFAULT_30MOHM; + struct fwnode_handle *node = NULL; + + if (pwr->dev->parent) + node = dev_fwnode(pwr->dev->parent); + + if (node) { + int ret; + u32 rs; + + ret = fwnode_property_read_u32(node, + "rohm,charger-sense-resistor-micro-ohms", + &rs); + if (ret) { + if (ret == -EINVAL) { + rs = RSENS_DEFAULT_30MOHM; + } else { + dev_err(pwr->dev, "Bad RSENS dt property\n"); + return ret; + } + } + if (!rs) { + dev_err(pwr->dev, "Bad RSENS value\n"); + return -EINVAL; + } + + rsens_ohm = (int)rs; + } + + /* Reg val to uA */ + do_div(tmp, rsens_ohm); + + pwr->curr_factor = tmp; + pwr->rsens = rsens_ohm; + dev_dbg(pwr->dev, "Setting rsens to %u micro ohm\n", pwr->rsens); + dev_dbg(pwr->dev, "Setting curr-factor to %u\n", pwr->curr_factor); + + return 0; +} + +static 
int bd71828_power_probe(struct platform_device *pdev) +{ + struct bd71828_power *pwr; + struct power_supply_config ac_cfg = {}; + struct power_supply_config bat_cfg = {}; + int ret; + struct regmap *regmap; + + regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!regmap) { + dev_err(&pdev->dev, "No parent regmap\n"); + return -EINVAL; + } + + pwr = devm_kzalloc(&pdev->dev, sizeof(*pwr), GFP_KERNEL); + if (!pwr) + return -ENOMEM; + + pwr->regmap = regmap; + pwr->dev = &pdev->dev; + pwr->chip_type = platform_get_device_id(pdev)->driver_data; + + switch (pwr->chip_type) { + case ROHM_CHIP_TYPE_BD71828: + pwr->bat_inserted = bd71828_bat_inserted; + pwr->get_temp = bd71828_get_temp; + pwr->regs = &pwr_regs_bd71828; + break; + case ROHM_CHIP_TYPE_BD71815: + pwr->bat_inserted = bd71815_bat_inserted; + pwr->get_temp = bd71815_get_temp; + pwr->regs = &pwr_regs_bd71815; + break; + default: + dev_err(pwr->dev, "Unknown PMIC\n"); + return -EINVAL; + } + + ret = bd7182x_get_rsens(pwr); + if (ret) + return dev_err_probe(&pdev->dev, ret, "sense resistor missing\n"); + + dev_set_drvdata(&pdev->dev, pwr); + bd71828_init_hardware(pwr); + + bat_cfg.drv_data = pwr; + bat_cfg.fwnode = dev_fwnode(&pdev->dev); + + ac_cfg.supplied_to = bd71828_ac_supplied_to; + ac_cfg.num_supplicants = ARRAY_SIZE(bd71828_ac_supplied_to); + ac_cfg.drv_data = pwr; + + pwr->ac = devm_power_supply_register(&pdev->dev, &bd71828_ac_desc, + &ac_cfg); + if (IS_ERR(pwr->ac)) + return dev_err_probe(&pdev->dev, PTR_ERR(pwr->ac), + "failed to register ac\n"); + + pwr->bat = devm_power_supply_register(&pdev->dev, &bd71828_bat_desc, + &bat_cfg); + if (IS_ERR(pwr->bat)) + return dev_err_probe(&pdev->dev, PTR_ERR(pwr->bat), + "failed to register bat\n"); + + ret = bd7182x_get_irqs(pdev, pwr); + if (ret) + return dev_err_probe(&pdev->dev, ret, "failed to request IRQs"); + + /* Configure wakeup capable */ + device_set_wakeup_capable(pwr->dev, 1); + device_set_wakeup_enable(pwr->dev, 1); + + return 0; +} + +static const struct platform_device_id bd71828_charger_id[] = { + { "bd71815-power", ROHM_CHIP_TYPE_BD71815 }, + { "bd71828-power", ROHM_CHIP_TYPE_BD71828 }, + { }, +}; +MODULE_DEVICE_TABLE(platform, bd71828_charger_id); + +static struct platform_driver bd71828_power_driver = { + .driver = { + .name = "bd718xx-power", + }, + .probe = bd71828_power_probe, + .id_table = bd71828_charger_id, +}; + +module_platform_driver(bd71828_power_driver); + +MODULE_AUTHOR("Cong Pham <cpham2403@gmail.com>"); +MODULE_DESCRIPTION("ROHM BD718(15/28/78) PMIC Battery Charger driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/cw2015_battery.c b/drivers/power/supply/cw2015_battery.c index 2263d5d3448f..0806abea2372 100644 --- a/drivers/power/supply/cw2015_battery.c +++ b/drivers/power/supply/cw2015_battery.c @@ -699,7 +699,13 @@ static int cw_bat_probe(struct i2c_client *client) if (!cw_bat->battery_workqueue) return -ENOMEM; - devm_delayed_work_autocancel(&client->dev, &cw_bat->battery_delay_work, cw_bat_work); + ret = devm_delayed_work_autocancel(&client->dev, &cw_bat->battery_delay_work, cw_bat_work); + if (ret) { + dev_err_probe(&client->dev, ret, + "Failed to register delayed work\n"); + return ret; + } + queue_delayed_work(cw_bat->battery_workqueue, &cw_bat->battery_delay_work, msecs_to_jiffies(10)); return 0; diff --git a/drivers/power/supply/max17040_battery.c b/drivers/power/supply/max17040_battery.c index c1640bc6accd..48453508688a 100644 --- a/drivers/power/supply/max17040_battery.c +++ b/drivers/power/supply/max17040_battery.c @@ -388,6 
+388,7 @@ static int max17040_get_property(struct power_supply *psy, union power_supply_propval *val) { struct max17040_chip *chip = power_supply_get_drvdata(psy); + int ret; switch (psp) { case POWER_SUPPLY_PROP_ONLINE: @@ -410,7 +411,10 @@ static int max17040_get_property(struct power_supply *psy, if (!chip->channel_temp) return -ENODATA; - iio_read_channel_processed(chip->channel_temp, &val->intval); + ret = iio_read_channel_processed(chip->channel_temp, &val->intval); + if (ret) + return ret; + val->intval /= 100; /* Convert from milli- to deci-degree */ break; diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index b1a227bf72e2..5dd02f658f5b 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -40,6 +40,39 @@ static enum power_supply_property max77705_charger_props[] = { POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, }; +static irqreturn_t max77705_aicl_irq(int irq, void *irq_drv_data) +{ + struct max77705_charger_data *chg = irq_drv_data; + unsigned int regval, irq_status; + int err; + + err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status); + if (err < 0) + return IRQ_HANDLED; + + // irq is fiered at the end of current decrease sequence too + // early check AICL_I bit to guard against that excess irq call + while (!(irq_status & BIT(MAX77705_AICL_I))) { + err = regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], ®val); + if (err < 0) + return IRQ_HANDLED; + + regval--; + + err = regmap_field_write(chg->rfield[MAX77705_CHG_CHGIN_LIM], regval); + if (err < 0) + return IRQ_HANDLED; + + msleep(AICL_WORK_DELAY_MS); + + err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status); + if (err < 0) + return IRQ_HANDLED; + } + + return IRQ_HANDLED; +} + static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data) { struct max77705_charger_data *chg = irq_drv_data; @@ -60,7 +93,7 @@ static const struct regmap_irq max77705_charger_irqs[] = { REGMAP_IRQ_REG_LINE(MAX77705_AICL_I, BITS_PER_BYTE), }; -static struct regmap_irq_chip max77705_charger_irq_chip = { +static const struct regmap_irq_chip max77705_charger_irq_chip = { .name = "max77705-charger", .status_base = MAX77705_CHG_REG_INT, .mask_base = MAX77705_CHG_REG_INT_MASK, @@ -567,6 +600,7 @@ static int max77705_charger_probe(struct i2c_client *i2c) { struct power_supply_config pscfg = {}; struct max77705_charger_data *chg; + struct regmap_irq_chip *chip_desc; struct device *dev; struct regmap_irq_chip_data *irq_data; int ret; @@ -580,6 +614,13 @@ static int max77705_charger_probe(struct i2c_client *i2c) chg->dev = dev; i2c_set_clientdata(i2c, chg); + chip_desc = devm_kmemdup(dev, &max77705_charger_irq_chip, + sizeof(max77705_charger_irq_chip), + GFP_KERNEL); + if (!chip_desc) + return -ENOMEM; + chip_desc->irq_drv_data = chg; + chg->regmap = devm_regmap_init_i2c(i2c, &max77705_chg_regmap_config); if (IS_ERR(chg->regmap)) return PTR_ERR(chg->regmap); @@ -599,11 +640,9 @@ static int max77705_charger_probe(struct i2c_client *i2c) if (IS_ERR(chg->psy_chg)) return PTR_ERR(chg->psy_chg); - max77705_charger_irq_chip.irq_drv_data = chg; ret = devm_regmap_add_irq_chip(chg->dev, chg->regmap, i2c->irq, IRQF_ONESHOT, 0, - &max77705_charger_irq_chip, - &irq_data); + chip_desc, &irq_data); if (ret) return dev_err_probe(dev, ret, "failed to add irq chip\n"); @@ -632,6 +671,15 @@ static int max77705_charger_probe(struct i2c_client *i2c) goto destroy_wq; } + ret = devm_request_threaded_irq(dev, regmap_irq_get_virq(irq_data, MAX77705_AICL_I), 
+ NULL, max77705_aicl_irq, + IRQF_TRIGGER_NONE, + "aicl-irq", chg); + if (ret) { + dev_err_probe(dev, ret, "Failed to Request aicl IRQ\n"); + goto destroy_wq; + } + ret = max77705_charger_enable(chg); if (ret) { dev_err_probe(dev, ret, "failed to enable charge\n"); diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index 3c2837ef3461..c8028606bba0 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -678,12 +678,7 @@ static int qcom_battmgr_set_charge_start_threshold(struct qcom_battmgr *battmgr, u32 target_soc, delta_soc; int ret; - if (start_soc < CHARGE_CTRL_START_THR_MIN || - start_soc > CHARGE_CTRL_START_THR_MAX) { - dev_err(battmgr->dev, "charge control start threshold exceed range: [%u - %u]\n", - CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX); - return -EINVAL; - } + start_soc = clamp(start_soc, CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX); /* * If the new start threshold is larger than the old end threshold, @@ -716,12 +711,7 @@ static int qcom_battmgr_set_charge_end_threshold(struct qcom_battmgr *battmgr, i u32 delta_soc = CHARGE_CTRL_DELTA_SOC; int ret; - if (end_soc < CHARGE_CTRL_END_THR_MIN || - end_soc > CHARGE_CTRL_END_THR_MAX) { - dev_err(battmgr->dev, "charge control end threshold exceed range: [%u - %u]\n", - CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX); - return -EINVAL; - } + end_soc = clamp(end_soc, CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX); if (battmgr->info.charge_ctrl_start && end_soc > battmgr->info.charge_ctrl_start) delta_soc = end_soc - battmgr->info.charge_ctrl_start; diff --git a/drivers/power/supply/rt5033_charger.c b/drivers/power/supply/rt5033_charger.c index 2fdc58439707..de724f23e453 100644 --- a/drivers/power/supply/rt5033_charger.c +++ b/drivers/power/supply/rt5033_charger.c @@ -701,6 +701,8 @@ static int rt5033_charger_probe(struct platform_device *pdev) np_conn = of_parse_phandle(pdev->dev.of_node, "richtek,usb-connector", 0); np_edev = of_get_parent(np_conn); charger->edev = extcon_find_edev_by_node(np_edev); + of_node_put(np_edev); + of_node_put(np_conn); if (IS_ERR(charger->edev)) { dev_warn(charger->dev, "no extcon device found in device-tree\n"); goto out; diff --git a/drivers/power/supply/rt9467-charger.c b/drivers/power/supply/rt9467-charger.c index fe773dd8b404..44c26fb37a77 100644 --- a/drivers/power/supply/rt9467-charger.c +++ b/drivers/power/supply/rt9467-charger.c @@ -376,7 +376,7 @@ static int rt9467_set_value_from_ranges(struct rt9467_chg_data *data, if (rsel == RT9467_RANGE_VMIVR) { ret = linear_range_get_selector_high(range, value, &sel, &found); if (ret) - value = range->max_sel; + sel = range->max_sel; } else { linear_range_get_selector_within(range, value, &sel); } @@ -588,6 +588,10 @@ static int rt9467_run_aicl(struct rt9467_chg_data *data) aicl_vth = mivr_vth + RT9467_AICLVTH_GAP_uV; ret = rt9467_set_value_from_ranges(data, F_AICL_VTH, RT9467_RANGE_AICL_VTH, aicl_vth); + if (ret) { + dev_err(data->dev, "Failed to set AICL VTH\n"); + return ret; + } /* Trigger AICL function */ ret = regmap_field_write(data->rm_field[F_AICL_MEAS], 1); diff --git a/drivers/power/supply/rt9756.c b/drivers/power/supply/rt9756.c new file mode 100644 index 000000000000..f254527be653 --- /dev/null +++ b/drivers/power/supply/rt9756.c @@ -0,0 +1,955 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// Copyright (C) 2025 Richtek Technology Corp. 
+// +// Authors: ChiYuan Huang <cy_huang@richtek.com> + +#include <linux/atomic.h> +#include <linux/cleanup.h> +#include <linux/i2c.h> +#include <linux/kernel.h> +#include <linux/linear_range.h> +#include <linux/interrupt.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/power_supply.h> +#include <linux/property.h> +#include <linux/regmap.h> +#include <linux/sysfs.h> +#include <linux/util_macros.h> + +#define RT9756_REG_INTFLAG1 0x0B +#define RT9756_REG_INTFLAG2 0x0D +#define RT9756_REG_INTFLAG3 0x0F +#define RT9756_REG_ADCCTL 0x11 +#define RT9756_REG_VBUSADC 0x12 +#define RT9756_REG_BC12FLAG 0x45 +#define RT9756_REG_INTFLAG4 0x49 + +/* Flag1 */ +#define RT9756_EVT_BUSOVP BIT(3) +#define RT9756_EVT_BUSOCP BIT(2) +#define RT9756_EVT_BUSUCP BIT(0) +/* Flag2 */ +#define RT9756_EVT_BATOVP BIT(7) +#define RT9756_EVT_BATOCP BIT(6) +#define RT9756_EVT_TDIEOTP BIT(3) +#define RT9756_EVT_VBUSLOW_ERR BIT(2) +#define RT9756_EVT_VAC_INSERT BIT(0) +/* Flag3 */ +#define RT9756_EVT_WDT BIT(5) +#define RT9756_EVT_VAC_UVLO BIT(4) +/* ADCCTL */ +#define RT9756_ADCEN_MASK BIT(7) +#define RT9756_ADCONCE_MASK BIT(6) +/* Bc12_flag */ +#define RT9756_EVT_BC12_DONE BIT(3) +/* Flag4 */ +#define RT9756_EVT_OUTOVP BIT(0) + +#define RICHTEK_DEVID 7 +#define RT9756_REVID 0 +#define RT9756A_REVID 1 +#define RT9757_REVID 2 +#define RT9757A_REVID 3 +#define RT9756_ADC_CONVTIME 1200 +#define RT9756_ADC_MAXWAIT 16000 + +enum rt9756_model { + MODEL_RT9756 = 0, + MODEL_RT9757, + MODEL_RT9770, + MODEL_MAX +}; + +enum rt9756_adc_chan { + ADC_VBUS = 0, + ADC_IBUS, + ADC_VBAT, + ADC_IBAT, + ADC_TDIE, + ADC_MAX_CHANNEL +}; + +enum rt9756_usb_type { + USB_NO_VBUS = 0, + USB_SDP = 2, + USB_NSTD, + USB_DCP, + USB_CDP, + MAX_USB_TYPE +}; + +enum rt9756_fields { + F_VBATOVP = 0, + F_VBATOVP_EN, + F_IBATOCP, + F_IBATOCP_EN, + F_VBUSOVP, + F_VBUSOVP_EN, + F_IBUSOCP, + F_IBUSOCP_EN, + F_SWITCHING, + F_REG_RST, + F_CHG_EN, + F_OP_MODE, + F_WDT_DIS, + F_WDT_TMR, + F_DEV_ID, + F_BC12_EN, + F_USB_STATE, + F_VBUS_STATE, + F_IBAT_RSEN, + F_REVISION, + F_MAX_FIELD +}; + +enum rt9756_ranges { + R_VBATOVP = 0, + R_IBATOCP, + R_VBUSOVP, + R_IBUSOCP, + R_MAX_RANGE +}; + +static const struct reg_field rt9756_chg_fields[F_MAX_FIELD] = { + [F_VBATOVP] = REG_FIELD(0x08, 0, 4), + [F_VBATOVP_EN] = REG_FIELD(0x08, 7, 7), + [F_IBATOCP] = REG_FIELD(0x09, 0, 5), + [F_IBATOCP_EN] = REG_FIELD(0x09, 7, 7), + [F_VBUSOVP] = REG_FIELD(0x06, 0, 5), + [F_VBUSOVP_EN] = REG_FIELD(0x06, 7, 7), + [F_IBUSOCP] = REG_FIELD(0x07, 0, 4), + [F_IBUSOCP_EN] = REG_FIELD(0x07, 5, 5), + [F_SWITCHING] = REG_FIELD(0x5c, 7, 7), + [F_REG_RST] = REG_FIELD(0x00, 7, 7), + [F_CHG_EN] = REG_FIELD(0x00, 6, 6), + [F_OP_MODE] = REG_FIELD(0x00, 5, 5), + [F_WDT_DIS] = REG_FIELD(0x00, 3, 3), + [F_WDT_TMR] = REG_FIELD(0x00, 0, 2), + [F_DEV_ID] = REG_FIELD(0x03, 0, 3), + [F_BC12_EN] = REG_FIELD(0x44, 7, 7), + [F_USB_STATE] = REG_FIELD(0x46, 5, 7), + [F_VBUS_STATE] = REG_FIELD(0x4c, 0, 0), + [F_IBAT_RSEN] = REG_FIELD(0x5e, 0, 1), + [F_REVISION] = REG_FIELD(0x62, 0, 1), +}; + +static const struct reg_field rt9770_chg_fields[F_MAX_FIELD] = { + [F_VBATOVP] = REG_FIELD(0x08, 0, 4), + [F_VBATOVP_EN] = REG_FIELD(0x08, 7, 7), + [F_IBATOCP] = REG_FIELD(0x09, 0, 5), + [F_IBATOCP_EN] = REG_FIELD(0x09, 7, 7), + [F_VBUSOVP] = REG_FIELD(0x06, 0, 5), + [F_VBUSOVP_EN] = REG_FIELD(0x06, 7, 7), + [F_IBUSOCP] = REG_FIELD(0x07, 0, 4), + [F_IBUSOCP_EN] = REG_FIELD(0x07, 5, 5), + [F_SWITCHING] = REG_FIELD(0x5c, 7, 7), + [F_REG_RST] = REG_FIELD(0x00, 7, 7), + 
[F_CHG_EN] = REG_FIELD(0x00, 6, 6), + [F_OP_MODE] = REG_FIELD(0x00, 5, 5), + [F_WDT_DIS] = REG_FIELD(0x00, 3, 3), + [F_WDT_TMR] = REG_FIELD(0x00, 0, 2), + [F_DEV_ID] = REG_FIELD(0x60, 0, 3), + [F_BC12_EN] = REG_FIELD(0x03, 7, 7), + [F_USB_STATE] = REG_FIELD(0x02, 5, 7), + [F_VBUS_STATE] = REG_FIELD(0x4c, 0, 0), + [F_IBAT_RSEN] = REG_FIELD(0x5e, 0, 1), + [F_REVISION] = REG_FIELD(0x62, 3, 7), +}; + +/* All converted to microvolt or microamp */ +static const struct linear_range rt9756_chg_ranges[R_MAX_RANGE] = { + LINEAR_RANGE_IDX(R_VBATOVP, 4200000, 0, 31, 25000), + LINEAR_RANGE_IDX(R_IBATOCP, 2000000, 0, 63, 100000), + LINEAR_RANGE_IDX(R_VBUSOVP, 3000000, 0, 63, 50000), + LINEAR_RANGE_IDX(R_IBUSOCP, 1000000, 0, 31, 250000), +}; + +struct charger_event { + unsigned int flag1; + unsigned int flag2; + unsigned int flag3; + unsigned int flag4; +}; + +struct rt9756_data { + struct device *dev; + struct regmap *regmap; + struct regmap_field *rm_fields[F_MAX_FIELD]; + struct power_supply *psy; + struct power_supply *bat_psy; + struct mutex adc_lock; + struct power_supply_desc psy_desc; + struct power_supply_desc bat_psy_desc; + struct charger_event chg_evt; + unsigned int rg_resistor; + unsigned int real_resistor; + enum rt9756_model model; + atomic_t usb_type; +}; + +struct rt975x_dev_data { + const struct regmap_config *regmap_config; + const struct reg_field *reg_fields; + const struct reg_sequence *init_regs; + size_t num_init_regs; + int (*check_device_model)(struct rt9756_data *data); +}; + +static int rt9756_get_value_field_range(struct rt9756_data *data, enum rt9756_fields en_field, + enum rt9756_fields field, enum rt9756_ranges rsel, int *val) +{ + const struct linear_range *range = rt9756_chg_ranges + rsel; + unsigned int enable, selector, value; + int ret; + + ret = regmap_field_read(data->rm_fields[en_field], &enable); + if (ret) + return ret; + + if (!enable) { + *val = 0; + return 0; + } + + ret = regmap_field_read(data->rm_fields[field], &selector); + if (ret) + return ret; + + ret = linear_range_get_value(range, selector, &value); + if (ret) + return ret; + + *val = (int)value; + + return 0; +} + +static int rt9756_set_value_field_range(struct rt9756_data *data, enum rt9756_fields en_field, + enum rt9756_fields field, enum rt9756_ranges rsel, int val) +{ + const struct linear_range *range = rt9756_chg_ranges + rsel; + unsigned int selector, value; + int ret; + + if (!val) + return regmap_field_write(data->rm_fields[en_field], 0); + + value = (unsigned int)val; + linear_range_get_selector_within(range, value, &selector); + ret = regmap_field_write(data->rm_fields[field], selector); + if (ret) + return ret; + + return regmap_field_write(data->rm_fields[en_field], 1); +} + +static int rt9756_get_adc(struct rt9756_data *data, enum rt9756_adc_chan chan, + int *val) +{ + struct regmap *regmap = data->regmap; + unsigned int reg_addr = RT9756_REG_VBUSADC + chan * 2; + unsigned int mask = RT9756_ADCEN_MASK | RT9756_ADCONCE_MASK; + unsigned int shift = 0, adc_cntl; + __be16 raws; + int scale, offset = 0, ret; + + guard(mutex)(&data->adc_lock); + + ret = regmap_update_bits(regmap, RT9756_REG_ADCCTL, mask, mask); + if (ret) + return ret; + + ret = regmap_read_poll_timeout(regmap, RT9756_REG_ADCCTL, adc_cntl, + !(adc_cntl & RT9756_ADCEN_MASK), + RT9756_ADC_CONVTIME, RT9756_ADC_MAXWAIT); + if (ret && ret != -ETIMEDOUT) + return ret; + + ret = regmap_raw_read(regmap, reg_addr, &raws, sizeof(raws)); + if (ret) + return ret; + + /* + * TDIE LSB 1'c, others LSB 1000uV or 1000uA. 
+ * Rsense ratio is needed for IBAT channel + */ + if (chan == ADC_TDIE) { + scale = 10; + shift = 8; + offset = -40; + } else if (chan == ADC_IBAT) + scale = 1000 * data->rg_resistor / data->real_resistor; + else + scale = 1000; + + *val = ((be16_to_cpu(raws) >> shift) + offset) * scale; + + return regmap_update_bits(regmap, RT9756_REG_ADCCTL, mask, 0); +} + +static int rt9756_get_switching_state(struct rt9756_data *data, int *status) +{ + unsigned int switching_state; + int ret; + + ret = regmap_field_read(data->rm_fields[F_SWITCHING], &switching_state); + if (ret) + return ret; + + if (switching_state) + *status = POWER_SUPPLY_STATUS_CHARGING; + else + *status = POWER_SUPPLY_STATUS_NOT_CHARGING; + + return 0; +} + +static int rt9756_get_charger_health(struct rt9756_data *data) +{ + struct charger_event *evt = &data->chg_evt; + + if (evt->flag2 & RT9756_EVT_VBUSLOW_ERR) + return POWER_SUPPLY_HEALTH_UNDERVOLTAGE; + + if (evt->flag1 & RT9756_EVT_BUSOVP || evt->flag2 & RT9756_EVT_BATOVP || + evt->flag4 & RT9756_EVT_OUTOVP) + return POWER_SUPPLY_HEALTH_OVERVOLTAGE; + + if (evt->flag1 & RT9756_EVT_BUSOCP || evt->flag2 & RT9756_EVT_BATOCP) + return POWER_SUPPLY_HEALTH_OVERCURRENT; + + if (evt->flag1 & RT9756_EVT_BUSUCP) + return POWER_SUPPLY_HEALTH_UNSPEC_FAILURE; + + if (evt->flag2 & RT9756_EVT_TDIEOTP) + return POWER_SUPPLY_HEALTH_OVERHEAT; + + if (evt->flag3 & RT9756_EVT_WDT) + return POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE; + + return POWER_SUPPLY_HEALTH_GOOD; +} + +static int rt9756_get_charger_online(struct rt9756_data *data, int *val) +{ + unsigned int online; + int ret; + + ret = regmap_field_read(data->rm_fields[F_VBUS_STATE], &online); + if (ret) + return ret; + + *val = !!online; + return 0; +} + +static int rt9756_get_vbus_ovp(struct rt9756_data *data, int *val) +{ + unsigned int opmode; + int ovpval, ret; + + /* operating mode -> 0 bypass, 1 div2 */ + ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode); + if (ret) + return ret; + + ret = rt9756_get_value_field_range(data, F_VBUSOVP_EN, F_VBUSOVP, R_VBUSOVP, &ovpval); + if (ret) + return ret; + + *val = opmode ? ovpval * 2 : ovpval; + return 0; +} + +static int rt9756_set_vbus_ovp(struct rt9756_data *data, int val) +{ + unsigned int opmode; + int ret; + + /* operating mode -> 0 bypass, 1 div2 */ + ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode); + if (ret) + return ret; + + return rt9756_set_value_field_range(data, F_VBUSOVP_EN, F_VBUSOVP, R_VBUSOVP, + opmode ? 
val / 2 : val); +} + +static const char * const rt9756_manufacturer = "Richtek Technology Corp."; +static const char * const rt9756_model[MODEL_MAX] = { "RT9756", "RT9757", "RT9770" }; + +static int rt9756_psy_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct rt9756_data *data = power_supply_get_drvdata(psy); + int *pval = &val->intval; + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + return rt9756_get_switching_state(data, pval); + case POWER_SUPPLY_PROP_HEALTH: + *pval = rt9756_get_charger_health(data); + return 0; + case POWER_SUPPLY_PROP_ONLINE: + return rt9756_get_charger_online(data, pval); + case POWER_SUPPLY_PROP_VOLTAGE_MAX: + return rt9756_get_vbus_ovp(data, pval); + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + return rt9756_get_adc(data, ADC_VBUS, pval); + case POWER_SUPPLY_PROP_CURRENT_MAX: + return rt9756_get_value_field_range(data, F_IBUSOCP_EN, F_IBUSOCP, R_IBUSOCP, pval); + case POWER_SUPPLY_PROP_CURRENT_NOW: + return rt9756_get_adc(data, ADC_IBUS, pval); + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + return rt9756_get_value_field_range(data, F_VBATOVP_EN, F_VBATOVP, R_VBATOVP, pval); + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + return rt9756_get_value_field_range(data, F_IBATOCP_EN, F_IBATOCP, R_IBATOCP, pval); + case POWER_SUPPLY_PROP_TEMP: + return rt9756_get_adc(data, ADC_TDIE, pval); + case POWER_SUPPLY_PROP_USB_TYPE: + *pval = atomic_read(&data->usb_type); + return 0; + case POWER_SUPPLY_PROP_MODEL_NAME: + val->strval = rt9756_model[data->model]; + return 0; + case POWER_SUPPLY_PROP_MANUFACTURER: + val->strval = rt9756_manufacturer; + return 0; + default: + return -ENODATA; + } +} + +static int rt9756_psy_set_property(struct power_supply *psy, + enum power_supply_property psp, + const union power_supply_propval *val) +{ + struct rt9756_data *data = power_supply_get_drvdata(psy); + int intval = val->intval; + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + memset(&data->chg_evt, 0, sizeof(data->chg_evt)); + return regmap_field_write(data->rm_fields[F_CHG_EN], !!intval); + case POWER_SUPPLY_PROP_VOLTAGE_MAX: + return rt9756_set_vbus_ovp(data, intval); + case POWER_SUPPLY_PROP_CURRENT_MAX: + return rt9756_set_value_field_range(data, F_IBUSOCP_EN, F_IBUSOCP, R_IBUSOCP, + intval); + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + return rt9756_set_value_field_range(data, F_VBATOVP_EN, F_VBATOVP, R_VBATOVP, + intval); + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + return rt9756_set_value_field_range(data, F_IBATOCP_EN, F_IBATOCP, R_IBATOCP, + intval); + case POWER_SUPPLY_PROP_USB_TYPE: + return regmap_field_write(data->rm_fields[F_BC12_EN], !!intval); + default: + return -EINVAL; + } +} + +static const enum power_supply_property rt9756_psy_properties[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_ONLINE, + POWER_SUPPLY_PROP_HEALTH, + POWER_SUPPLY_PROP_ONLINE, + POWER_SUPPLY_PROP_VOLTAGE_MAX, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_CURRENT_MAX, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX, + POWER_SUPPLY_PROP_TEMP, + POWER_SUPPLY_PROP_USB_TYPE, + POWER_SUPPLY_PROP_MODEL_NAME, + POWER_SUPPLY_PROP_MANUFACTURER, +}; + +static int rt9756_bat_psy_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct rt9756_data *data = power_supply_get_drvdata(psy); + int *pval = &val->intval; + + switch (psp) { + case 
POWER_SUPPLY_PROP_TECHNOLOGY: + *pval = POWER_SUPPLY_TECHNOLOGY_LION; + return 0; + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + return rt9756_get_adc(data, ADC_VBAT, pval); + case POWER_SUPPLY_PROP_CURRENT_NOW: + return rt9756_get_adc(data, ADC_IBAT, pval); + default: + return -ENODATA; + } +} + +static const enum power_supply_property rt9756_bat_psy_properties[] = { + POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_CURRENT_NOW, +}; + +static int rt9756_psy_property_is_writeable(struct power_supply *psy, + enum power_supply_property psp) +{ + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + case POWER_SUPPLY_PROP_ONLINE: + case POWER_SUPPLY_PROP_VOLTAGE_MAX: + case POWER_SUPPLY_PROP_CURRENT_MAX: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + case POWER_SUPPLY_PROP_USB_TYPE: + return 1; + default: + return 0; + } +} + +static const unsigned int rt9756_wdt_millisecond[] = { + 500, 1000, 5000, 30000, 40000, 80000, 128000, 255000 +}; + +static ssize_t watchdog_timer_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct power_supply *psy = to_power_supply(dev); + struct rt9756_data *data = power_supply_get_drvdata(psy); + unsigned int wdt_tmr_now = 0, wdt_sel, wdt_dis; + int ret; + + ret = regmap_field_read(data->rm_fields[F_WDT_DIS], &wdt_dis); + if (ret) + return ret; + + if (!wdt_dis) { + ret = regmap_field_read(data->rm_fields[F_WDT_TMR], &wdt_sel); + if (ret) + return ret; + + wdt_tmr_now = rt9756_wdt_millisecond[wdt_sel]; + } + + return sysfs_emit(buf, "%d\n", wdt_tmr_now); +} + +static ssize_t watchdog_timer_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct power_supply *psy = to_power_supply(dev); + struct rt9756_data *data = power_supply_get_drvdata(psy); + unsigned int wdt_set, wdt_sel; + int ret; + + ret = kstrtouint(buf, 10, &wdt_set); + if (ret) + return ret; + + ret = regmap_field_write(data->rm_fields[F_WDT_DIS], 1); + if (ret) + return ret; + + wdt_sel = find_closest(wdt_set, rt9756_wdt_millisecond, + ARRAY_SIZE(rt9756_wdt_millisecond)); + + ret = regmap_field_write(data->rm_fields[F_WDT_TMR], wdt_sel); + if (ret) + return ret; + + if (wdt_set) { + ret = regmap_field_write(data->rm_fields[F_WDT_DIS], 0); + if (ret) + return ret; + } + + return count; +} + +static const char * const rt9756_opmode_str[] = { "bypass", "div2" }; + +static ssize_t operation_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct power_supply *psy = to_power_supply(dev); + struct rt9756_data *data = power_supply_get_drvdata(psy); + unsigned int opmode; + int ret; + + ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode); + if (ret) + return ret; + + return sysfs_emit(buf, "%s\n", rt9756_opmode_str[opmode]); +} + +static ssize_t operation_mode_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct power_supply *psy = to_power_supply(dev); + struct rt9756_data *data = power_supply_get_drvdata(psy); + int index, ret; + + index = sysfs_match_string(rt9756_opmode_str, buf); + if (index < 0) + return index; + + ret = regmap_field_write(data->rm_fields[F_OP_MODE], index); + + return ret ?: count; +} + +static DEVICE_ATTR_RW(watchdog_timer); +static DEVICE_ATTR_RW(operation_mode); + +static struct attribute *rt9756_sysfs_attrs[] = { + &dev_attr_watchdog_timer.attr, + &dev_attr_operation_mode.attr, + NULL +}; +ATTRIBUTE_GROUPS(rt9756_sysfs); + +static int 
rt9756_register_psy(struct rt9756_data *data) +{ + struct power_supply_desc *desc = &data->psy_desc; + struct power_supply_desc *bat_desc = &data->bat_psy_desc; + struct power_supply_config cfg = {}, bat_cfg = {}; + struct device *dev = data->dev; + char *psy_name, *bat_psy_name, **supplied_to; + + bat_cfg.drv_data = data; + bat_cfg.fwnode = dev_fwnode(dev); + + bat_psy_name = devm_kasprintf(dev, GFP_KERNEL, "rt9756-%s-battery", dev_name(dev)); + if (!bat_psy_name) + return -ENOMEM; + + bat_desc->name = bat_psy_name; + bat_desc->type = POWER_SUPPLY_TYPE_BATTERY; + bat_desc->properties = rt9756_bat_psy_properties; + bat_desc->num_properties = ARRAY_SIZE(rt9756_bat_psy_properties); + bat_desc->get_property = rt9756_bat_psy_get_property; + + data->bat_psy = devm_power_supply_register(dev, bat_desc, &bat_cfg); + if (IS_ERR(data->bat_psy)) + return dev_err_probe(dev, PTR_ERR(data->bat_psy), "Failed to register battery\n"); + + supplied_to = devm_kzalloc(dev, sizeof(*supplied_to), GFP_KERNEL); + if (!supplied_to) + return -ENOMEM; + + /* Link charger psy to battery psy */ + supplied_to[0] = bat_psy_name; + + cfg.drv_data = data; + cfg.fwnode = dev_fwnode(dev); + cfg.attr_grp = rt9756_sysfs_groups; + cfg.supplied_to = supplied_to; + cfg.num_supplicants = 1; + + psy_name = devm_kasprintf(dev, GFP_KERNEL, "rt9756-%s", dev_name(dev)); + if (!psy_name) + return -ENOMEM; + + desc->name = psy_name; + desc->type = POWER_SUPPLY_TYPE_USB; + desc->usb_types = BIT(POWER_SUPPLY_USB_TYPE_UNKNOWN) | BIT(POWER_SUPPLY_USB_TYPE_SDP) | + BIT(POWER_SUPPLY_USB_TYPE_DCP) | BIT(POWER_SUPPLY_USB_TYPE_CDP); + desc->properties = rt9756_psy_properties; + desc->num_properties = ARRAY_SIZE(rt9756_psy_properties); + desc->property_is_writeable = rt9756_psy_property_is_writeable; + desc->get_property = rt9756_psy_get_property; + desc->set_property = rt9756_psy_set_property; + + data->psy = devm_power_supply_register(dev, desc, &cfg); + + return PTR_ERR_OR_ZERO(data->psy); +} + +static int rt9756_get_usb_type(struct rt9756_data *data) +{ + unsigned int type; + int report_type, ret; + + ret = regmap_field_read(data->rm_fields[F_USB_STATE], &type); + if (ret) + return ret; + + switch (type) { + case USB_SDP: + case USB_NSTD: + report_type = POWER_SUPPLY_USB_TYPE_SDP; + break; + case USB_DCP: + report_type = POWER_SUPPLY_USB_TYPE_DCP; + break; + case USB_CDP: + report_type = POWER_SUPPLY_USB_TYPE_CDP; + break; + case USB_NO_VBUS: + default: + report_type = POWER_SUPPLY_USB_TYPE_UNKNOWN; + break; + } + + atomic_set(&data->usb_type, report_type); + return 0; +} + +static irqreturn_t rt9756_irq_handler(int irq, void *devid) +{ + struct rt9756_data *data = devid; + struct regmap *regmap = data->regmap; + struct charger_event *evt = &data->chg_evt; + unsigned int bc12_flag = 0; + int ret; + + ret = regmap_read(regmap, RT9756_REG_INTFLAG1, &evt->flag1); + if (ret) + return IRQ_NONE; + + ret = regmap_read(regmap, RT9756_REG_INTFLAG2, &evt->flag2); + if (ret) + return IRQ_NONE; + + ret = regmap_read(regmap, RT9756_REG_INTFLAG3, &evt->flag3); + if (ret) + return IRQ_NONE; + + if (data->model != MODEL_RT9770) { + ret = regmap_read(regmap, RT9756_REG_INTFLAG4, &evt->flag4); + if (ret) + return IRQ_NONE; + + ret = regmap_read(regmap, RT9756_REG_BC12FLAG, &bc12_flag); + if (ret) + return IRQ_NONE; + } + + dev_dbg(data->dev, "events: 0x%02x,%02x,%02x,%02x,%02x\n", evt->flag1, evt->flag2, + evt->flag3, evt->flag4, bc12_flag); + + if (evt->flag2 & RT9756_EVT_VAC_INSERT) { + ret = regmap_field_write(data->rm_fields[F_BC12_EN], 1); + if (ret) + 
return IRQ_NONE; + } + + if (evt->flag3 & RT9756_EVT_VAC_UVLO) + atomic_set(&data->usb_type, POWER_SUPPLY_USB_TYPE_UNKNOWN); + + if (bc12_flag & RT9756_EVT_BC12_DONE) { + ret = rt9756_get_usb_type(data); + if (ret) + return IRQ_NONE; + } + + power_supply_changed(data->psy); + + return IRQ_HANDLED; +} + +static int rt9756_config_batsense_resistor(struct rt9756_data *data) +{ + unsigned int shunt_resistor_uohms = 2000, rsense_sel; + + device_property_read_u32(data->dev, "shunt-resistor-micro-ohms", &shunt_resistor_uohms); + + if (!shunt_resistor_uohms || shunt_resistor_uohms > 5000) + return -EINVAL; + + data->real_resistor = shunt_resistor_uohms; + + /* Always choose the larger or equal one to prevent false ocp alarm */ + if (shunt_resistor_uohms <= 1000) { + rsense_sel = 0; + data->rg_resistor = 1000; + } else if (shunt_resistor_uohms <= 2000) { + rsense_sel = 1; + data->rg_resistor = 2000; + } else { + rsense_sel = 2; + data->rg_resistor = 5000; + } + + return regmap_field_write(data->rm_fields[F_IBAT_RSEN], rsense_sel); +} + +static const struct reg_sequence rt9756_init_regs[] = { + REG_SEQ(0x00, 0x80, 1000), /* REG_RESET */ + REG_SEQ0(0x04, 0x13), /* VACOVP/OVPGATE 12V */ + REG_SEQ0(0x00, 0x28), /* WDT_DIS = 1 */ + REG_SEQ0(0x0c, 0x02), /* MASK FLAG1 */ + REG_SEQ0(0x0e, 0x06), /* MASK FLAG2 */ + REG_SEQ0(0x10, 0xca), /* MASK FLAG3 */ + REG_SEQ0(0x44, 0xa0), /* BC12_EN */ + REG_SEQ0(0x47, 0x07), /* MASK BC12FLAG */ + REG_SEQ0(0x4a, 0xfe), /* MASK FLAG4 */ + REG_SEQ0(0x5c, 0x40), /* MASK CON_SWITCHING */ + REG_SEQ0(0x63, 0x01), /* MASK VDDA_UVLO */ +}; + +static const struct reg_sequence rt9770_init_regs[] = { + REG_SEQ(0x00, 0x80, 1000), /* REG_RESET */ + REG_SEQ0(0x04, 0x13), /* VACOVP/OVPGATE 12V */ + REG_SEQ0(0x00, 0x28), /* WDT_DIS = 1 */ + REG_SEQ0(0x0c, 0x02), /* MASK FLAG1 */ + REG_SEQ0(0x0e, 0x06), /* MASK FLAG2 */ + REG_SEQ0(0x10, 0xca), /* MASK FLAG3 */ + REG_SEQ0(0x5c, 0x40), /* MASK CON_SWITCHING */ + REG_SEQ0(0x63, 0x01), /* MASK VDDA_UVLO */ +}; + +static const struct regmap_config rt9756_regmap_config = { + .name = "rt9756", + .reg_bits = 16, + .val_bits = 8, + .max_register = 0x1ff, +}; + +static const struct regmap_config rt9770_regmap_config = { + .name = "rt9770", + .reg_bits = 8, + .val_bits = 8, + .max_register = 0xff, +}; + +static int rt9756_check_device_model(struct rt9756_data *data) +{ + struct device *dev = data->dev; + unsigned int revid; + int ret; + + ret = regmap_field_read(data->rm_fields[F_REVISION], &revid); + if (ret) + return dev_err_probe(dev, ret, "Failed to read revid\n"); + + if (revid == RT9757_REVID || revid == RT9757A_REVID) + data->model = MODEL_RT9757; + else if (revid == RT9756_REVID || revid == RT9756A_REVID) + data->model = MODEL_RT9756; + else + return dev_err_probe(dev, -EINVAL, "Unknown revision %d\n", revid); + + return 0; +} + +static int rt9770_check_device_model(struct rt9756_data *data) +{ + data->model = MODEL_RT9770; + return 0; +} + +static int rt9756_probe(struct i2c_client *i2c) +{ + const struct rt975x_dev_data *dev_data; + struct device *dev = &i2c->dev; + struct rt9756_data *data; + struct regmap *regmap; + unsigned int devid; + int ret; + + dev_data = device_get_match_data(dev); + if (!dev_data) + return dev_err_probe(dev, -EINVAL, "No device data found\n"); + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->dev = dev; + mutex_init(&data->adc_lock); + atomic_set(&data->usb_type, POWER_SUPPLY_USB_TYPE_UNKNOWN); + i2c_set_clientdata(i2c, data); + + regmap = 
devm_regmap_init_i2c(i2c, dev_data->regmap_config); + if (IS_ERR(regmap)) + return dev_err_probe(dev, PTR_ERR(regmap), "Failed to init regmap\n"); + + data->regmap = regmap; + + ret = devm_regmap_field_bulk_alloc(dev, regmap, data->rm_fields, dev_data->reg_fields, + F_MAX_FIELD); + if (ret) + return dev_err_probe(dev, ret, "Failed to alloc regmap fields\n"); + + /* Richtek Device ID check */ + ret = regmap_field_read(data->rm_fields[F_DEV_ID], &devid); + if (ret) + return dev_err_probe(dev, ret, "Failed to read devid\n"); + + if (devid != RICHTEK_DEVID) + return dev_err_probe(dev, -ENODEV, "Incorrect VID 0x%02x\n", devid); + + /* Get specific model */ + ret = dev_data->check_device_model(data); + if (ret) + return ret; + + ret = regmap_register_patch(regmap, dev_data->init_regs, dev_data->num_init_regs); + if (ret) + return dev_err_probe(dev, ret, "Failed to init registers\n"); + + ret = rt9756_config_batsense_resistor(data); + if (ret) + return dev_err_probe(dev, ret, "Failed to config batsense resistor\n"); + + ret = rt9756_register_psy(data); + if (ret) + return dev_err_probe(dev, ret, "Failed to init power supply\n"); + + return devm_request_threaded_irq(dev, i2c->irq, NULL, rt9756_irq_handler, IRQF_ONESHOT, + dev_name(dev), data); +} + +static void rt9756_shutdown(struct i2c_client *i2c) +{ + struct rt9756_data *data = i2c_get_clientdata(i2c); + + regmap_field_write(data->rm_fields[F_REG_RST], 1); +} + +static const struct rt975x_dev_data rt9756_dev_data = { + .regmap_config = &rt9756_regmap_config, + .reg_fields = rt9756_chg_fields, + .init_regs = rt9756_init_regs, + .num_init_regs = ARRAY_SIZE(rt9756_init_regs), + .check_device_model = rt9756_check_device_model, +}; + +static const struct rt975x_dev_data rt9770_dev_data = { + .regmap_config = &rt9770_regmap_config, + .reg_fields = rt9770_chg_fields, + .init_regs = rt9770_init_regs, + .num_init_regs = ARRAY_SIZE(rt9770_init_regs), + .check_device_model = rt9770_check_device_model, +}; + +static const struct of_device_id rt9756_device_match_table[] = { + { .compatible = "richtek,rt9756", .data = &rt9756_dev_data }, + { .compatible = "richtek,rt9770", .data = &rt9770_dev_data }, + {} +}; +MODULE_DEVICE_TABLE(of, rt9756_device_match_table); + +static struct i2c_driver rt9756_charger_driver = { + .driver = { + .name = "rt9756", + .of_match_table = rt9756_device_match_table, + }, + .probe = rt9756_probe, + .shutdown = rt9756_shutdown, +}; +module_i2c_driver(rt9756_charger_driver); + +MODULE_DESCRIPTION("Richtek RT9756 charger driver"); +MODULE_AUTHOR("ChiYuan Huang <cy_huang@richtek.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/wm831x_power.c b/drivers/power/supply/wm831x_power.c index 6acdba7885ca..78fa0573ef25 100644 --- a/drivers/power/supply/wm831x_power.c +++ b/drivers/power/supply/wm831x_power.c @@ -144,6 +144,7 @@ static int wm831x_usb_limit_change(struct notifier_block *nb, struct wm831x_power, usb_notify); unsigned int i, best; + int ret; /* Find the highest supported limit */ best = 0; @@ -156,8 +157,13 @@ static int wm831x_usb_limit_change(struct notifier_block *nb, dev_dbg(wm831x_power->wm831x->dev, "Limiting USB current to %umA", wm831x_usb_limits[best]); - wm831x_set_bits(wm831x_power->wm831x, WM831X_POWER_STATE, - WM831X_USB_ILIM_MASK, best); + ret = wm831x_set_bits(wm831x_power->wm831x, WM831X_POWER_STATE, + WM831X_USB_ILIM_MASK, best); + if (ret < 0) { + dev_err(wm831x_power->wm831x->dev, + "Failed to set USB current limit: %d\n", ret); + return ret; + } return 0; } diff --git a/drivers/ras/ras.c 
b/drivers/ras/ras.c index ac0e132ccc3e..2a5b5a9fdcb3 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -53,9 +53,45 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, } EXPORT_SYMBOL_GPL(log_non_standard_event); -void log_arm_hw_error(struct cper_sec_proc_arm *err) +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { - trace_arm_event(err); + struct cper_arm_err_info *err_info; + struct cper_arm_ctx_info *ctx_info; + u8 *ven_err_data; + u32 ctx_len = 0; + int n, sz, cpu; + s32 vsei_len; + u32 pei_len; + u8 *pei_err, *ctx_err; + + pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num; + pei_err = (u8 *)(err + 1); + + err_info = (struct cper_arm_err_info *)(err + 1); + ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num); + ctx_err = (u8 *)ctx_info; + + for (n = 0; n < err->context_info_num; n++) { + sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size; + ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz); + ctx_len += sz; + } + + vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + pei_len + ctx_len); + if (vsei_len < 0) { + pr_warn(FW_BUG "section length: %d\n", err->section_length); + pr_warn(FW_BUG "section length is too small\n"); + pr_warn(FW_BUG "firmware-generated error record is incorrect\n"); + vsei_len = 0; + } + ven_err_data = (u8 *)ctx_info; + + cpu = GET_LOGICAL_INDEX(err->mpidr); + if (cpu < 0) + cpu = -1; + + trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len, + ven_err_data, (u32)vsei_len, sev, cpu); } static int __init ras_init(void) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index ea532a8a4a0c..a596f6013019 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -313,10 +313,12 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_ccw_private *private, return 0; } -static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, - struct vfio_region_info *info, - unsigned long arg) +static int vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); int i; switch (info->index) { @@ -328,7 +330,6 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, return 0; default: /* all other regions are handled via capability chain */ { - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_type cap_type = { .header.id = VFIO_REGION_INFO_CAP_TYPE, .header.version = 1 }; @@ -351,27 +352,10 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, cap_type.type = private->region[i].type; cap_type.subtype = private->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; - - info->flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info->argsz < sizeof(*info) + caps.size) { - info->argsz = sizeof(*info) + caps.size; - info->cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(*info)); - if (copy_to_user((void __user *)arg + sizeof(*info), - caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info->cap_offset = sizeof(*info); - } - - kfree(caps.buf); - } } return 0; @@ -532,24 +516,6 @@ static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = vfio_ccw_mdev_get_region_info(private, &info, arg); - if (ret) - return ret; - - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -627,6 +593,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .read = vfio_ccw_mdev_read, .write = vfio_ccw_mdev_write, .ioctl = vfio_ccw_mdev_ioctl, + .get_region_info_caps = vfio_ccw_mdev_ioctl_get_region_info, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c index ff9adfc0b332..bdfd06516671 100644 --- a/drivers/scsi/bfa/bfad.c +++ b/drivers/scsi/bfa/bfad.c @@ -1528,7 +1528,6 @@ bfad_pci_slot_reset(struct pci_dev *pdev) goto out_disable_device; } - pci_save_state(pdev); pci_set_master(pdev); rc = dma_set_mask_and_coherent(&bfad->pcidev->dev, DMA_BIT_MASK(64)); diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c index 79c8dafdd49e..db0c2174430a 100644 --- a/drivers/scsi/csiostor/csio_init.c +++ b/drivers/scsi/csiostor/csio_init.c @@ -1093,7 +1093,6 @@ csio_pci_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); pci_restore_state(pdev); - pci_save_state(pdev); /* Bring HW s/m to ready state. * but don't resume IOs. diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 44214884deaf..95123689e9d1 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -7859,7 +7859,6 @@ static int ipr_reset_restore_cfg_space(struct ipr_cmnd *ipr_cmd) struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg; ENTER; - ioa_cfg->pdev->state_saved = true; pci_restore_state(ioa_cfg->pdev); if (ipr_set_pcix_cmd_reg(ioa_cfg)) { diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index f206267d9ecd..065eb91de9c0 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -14434,12 +14434,6 @@ lpfc_io_slot_reset_s3(struct pci_dev *pdev) pci_restore_state(pdev); - /* - * As the new kernel behavior of pci_restore_state() API call clears - * device saved_state flag, need to save the restored state again. 
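Several hunks in this series (bfad, csiostor, ipr, lpfc, qla2xxx, qla4xxx, 8250_pci, jsm) drop the pci_save_state() call that used to follow pci_restore_state() in their error-recovery paths; the comments being removed explain it was only there because pci_restore_state() once cleared the device's saved_state flag. A minimal sketch of the recovery pattern these drivers converge on, assuming pci_restore_state() now preserves the saved state (illustrative only, not taken from any single driver):

static pci_ers_result_t example_io_slot_reset(struct pci_dev *pdev)
{
	if (pci_enable_device(pdev))
		return PCI_ERS_RESULT_DISCONNECT;

	/* Restoring config space no longer requires re-saving it. */
	pci_restore_state(pdev);
	pci_set_master(pdev);

	return PCI_ERS_RESULT_RECOVERED;
}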
- */ - pci_save_state(pdev); - if (pdev->is_busmaster) pci_set_master(pdev); diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 5ffd94586652..9007533e36e0 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -7886,11 +7886,6 @@ qla2xxx_pci_slot_reset(struct pci_dev *pdev) pci_restore_state(pdev); - /* pci_restore_state() clears the saved_state flag of the device - * save restored state which resets saved_state flag - */ - pci_save_state(pdev); - if (ha->mem_only) rc = pci_enable_device_mem(pdev); else diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 83ff66f954e6..97329c97332f 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -9796,11 +9796,6 @@ qla4xxx_pci_slot_reset(struct pci_dev *pdev) */ pci_restore_state(pdev); - /* pci_restore_state() clears the saved_state flag of the device - * save restored state which resets saved_state flag - */ - pci_save_state(pdev); - /* Initialize device or resume if in suspended state */ rc = pci_enable_device(pdev); if (rc) { diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 152f914c599d..65bd370f282a 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -6178,7 +6178,6 @@ static pci_ers_result_t serial8250_io_slot_reset(struct pci_dev *dev) return PCI_ERS_RESULT_DISCONNECT; pci_restore_state(dev); - pci_save_state(dev); return PCI_ERS_RESULT_RECOVERED; } diff --git a/drivers/tty/serial/jsm/jsm_driver.c b/drivers/tty/serial/jsm/jsm_driver.c index 417a5b6bffc3..8d21373cae57 100644 --- a/drivers/tty/serial/jsm/jsm_driver.c +++ b/drivers/tty/serial/jsm/jsm_driver.c @@ -355,7 +355,6 @@ static void jsm_io_resume(struct pci_dev *pdev) struct jsm_board *brd = pci_get_drvdata(pdev); pci_restore_state(pdev); - pci_save_state(pdev); jsm_uart_port_init(brd); } diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index a7936bd1aabe..ddaa1366704b 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1256,7 +1256,7 @@ static int query_virtqueues(struct mlx5_vdpa_net *ndev, int vq_idx = start_vq + i; if (cmd->err) { - mlx5_vdpa_err(mvdev, "query vq %d failed, err: %d\n", vq_idx, err); + mlx5_vdpa_err(mvdev, "query vq %d failed, err: %d\n", vq_idx, cmd->err); if (!err) err = cmd->err; continue; diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c index 9e8d07078606..31a02e7fd7f2 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_main.c +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c @@ -736,6 +736,7 @@ static int octep_sriov_enable(struct pci_dev *pdev, int num_vfs) octep_vdpa_assign_barspace(vf_pdev, pdev, index); if (++index == num_vfs) { done = true; + pci_dev_put(vf_pdev); break; } } diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 36f61cc96e21..43426bd971ac 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -51,7 +51,7 @@ static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv) err = pdsc_register_notify(nb); if (err) { nb->notifier_call = NULL; - dev_err(dev, "failed to register pds event handler: %ps\n", + dev_err(dev, "failed to register pds event handler: %pe\n", ERR_PTR(err)); return -EINVAL; } diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index e7bced0b5542..ae357d014564 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ 
b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -2173,7 +2173,8 @@ static int vduse_init(void) if (!vduse_irq_wq) goto err_wq; - vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0); + vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", + WQ_HIGHPRI | WQ_PERCPU, 0); if (!vduse_irq_bound_wq) goto err_bound_wq; diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 5dd5f5ad7686..253031b86b60 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -129,28 +129,22 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, - struct vfio_region_info __user *arg) +static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); - struct vfio_region_info info; - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= cdx_dev->res_count) + if (info->index >= cdx_dev->res_count) return -EINVAL; /* map offset to the physical address */ - info.offset = vfio_cdx_index_to_offset(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + info->offset = vfio_cdx_index_to_offset(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, @@ -219,8 +213,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, switch (cmd) { case VFIO_DEVICE_GET_INFO: return vfio_cdx_ioctl_get_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_cdx_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_GET_IRQ_INFO: return vfio_cdx_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_SET_IRQS: @@ -284,6 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .get_region_info_caps = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 480cac3a0c27..8ceca24ac136 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -99,7 +99,7 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, return ret; if (user_size < minsz) return -EINVAL; - ret = copy_struct_from_user(&bind, minsz, arg, user_size); + ret = copy_struct_from_user(&bind, sizeof(bind), arg, user_size); if (ret) return ret; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 76ccbab0e3d6..ba47100f28c1 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,6 +117,24 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } +static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct 
fsl_mc_device *mc_dev = vdev->mc_dev; + + if (info->index >= mc_dev->obj_desc.region_count) + return -EINVAL; + + /* map offset to the physical address */ + info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; +} + static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -149,30 +167,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -589,6 +583,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, + .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f54665..1e82b44bda1a 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. 
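The VFIO conversions in this series replace each driver's open-coded VFIO_DEVICE_GET_REGION_INFO handling with a .get_region_info_caps callback that only fills in struct vfio_region_info and appends optional capabilities; the user-copy and capability-chain plumbing moves into common code. The core helper itself is not part of this diff, so the following is only a sketch reconstructed from the boilerplate removed from the drivers (the function name and its exact placement in the core are assumptions):

static int vfio_ioctl_get_region_info_sketch(struct vfio_device *device,
					     unsigned long arg)
{
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_region_info info;
	int ret;

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;
	if (info.argsz < minsz)
		return -EINVAL;

	/* Driver fills info and optionally appends capabilities. */
	ret = device->ops->get_region_info_caps(device, &info, &caps);
	if (ret)
		goto out;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			/* Tell userspace how much room the caps need. */
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user((void __user *)arg + sizeof(info),
					 caps.buf, caps.size)) {
				ret = -EFAULT;
				goto out;
			}
			info.cap_offset = sizeof(info);
		}
	}

	if (copy_to_user((void __user *)arg, &info, minsz))
		ret = -EFAULT;
out:
	kfree(caps.buf);
	return ret;
}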
+config VFIO_PCI_DMABUF + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" @@ -67,4 +70,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig" source "drivers/vfio/pci/qat/Kconfig" +source "drivers/vfio/pci/xe/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c..e0a0757dd1d2 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,6 +2,7 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o @@ -19,3 +20,5 @@ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/ obj-$(CONFIG_QAT_VFIO_PCI) += qat/ + +obj-$(CONFIG_XE_VFIO_PCI) += xe/ diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fde33f54e99e..cf45f6370c36 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) return 0; } +static void qm_xqc_reg_offsets(struct hisi_qm *qm, + u32 *eqc_addr, u32 *aeqc_addr) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + container_of(qm, struct hisi_acc_vf_core_device, vf_qm); + + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) { + *eqc_addr = QM_EQC_VF_DW0; + *aeqc_addr = QM_AEQC_VF_DW0; + } else { + *eqc_addr = QM_EQC_PF_DW0; + *aeqc_addr = QM_AEQC_PF_DW0; + } +} + static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_AEQC_DW\n"); return ret; @@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; /* Check VF state */ @@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_AEQC_DW\n"); return ret; @@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device; struct 
hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct pci_dev *vf_dev = vdev->pdev; + u32 val; - /* - * ACC VF dev BAR2 region consists of both functional register space - * and migration control register space. For migration to work, we - * need access to both. Hence, we map the entire BAR2 region here. - * But unnecessarily exposing the migration BAR region to the Guest - * has the potential to prevent/corrupt the Guest migration. Hence, - * we restrict access to the migration control space from - * Guest(Please see mmap/ioctl/read/write override functions). - * - * Please note that it is OK to expose the entire VF BAR if migration - * is not supported or required as this cannot affect the ACC PF - * configurations. - * - * Also the HiSilicon ACC VF devices supported by this driver on - * HiSilicon hardware platforms are integrated end point devices - * and the platform lacks the capability to perform any PCIe P2P - * between these devices. - */ + val = readl(pf_qm->io_base + QM_MIG_REGION_SEL); + if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN)) + hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL; + else + hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL; - vf_qm->io_base = - ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), - pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); - if (!vf_qm->io_base) - return -EIO; + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) { + /* + * On hardware platforms greater than QM_HW_V3, the migration function + * register is placed in the BAR2 configuration region of the PF, + * and each VF device occupies 8KB of configuration space. + */ + vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET + + hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE; + } else { + /* + * ACC VF dev BAR2 region consists of both functional register space + * and migration control register space. For migration to work, we + * need access to both. Hence, we map the entire BAR2 region here. + * But unnecessarily exposing the migration BAR region to the Guest + * has the potential to prevent/corrupt the Guest migration. Hence, + * we restrict access to the migration control space from + * Guest(Please see mmap/ioctl/read/write override functions). + * + * Please note that it is OK to expose the entire VF BAR if migration + * is not supported or required as this cannot affect the ACC PF + * configurations. + * + * Also the HiSilicon ACC VF devices supported by this driver on + * HiSilicon hardware platforms are integrated end point devices + * and the platform lacks the capability to perform any PCIe P2P + * between these devices. + */ + vf_qm->io_base = + ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), + pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); + if (!vf_qm->io_base) + return -EIO; + } vf_qm->fun_type = QM_HW_VF; + vf_qm->ver = pf_qm->ver; vf_qm->pdev = vf_dev; mutex_init(&vf_qm->mailbox_lock); @@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev) return !IS_ERR(pf_qm) ? pf_qm : NULL; } +static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev, + unsigned int index) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + hisi_acc_drvdata(vdev->pdev); + + /* + * On the old HW_ACC_MIG_VF_CTRL mode device, the ACC VF device + * BAR2 region encompasses both functional register space + * and migration control register space. + * only the functional region should be report to Guest. 
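+	 * e.g. a 64KB VF BAR2 is reported to the Guest as 32KB here, while
+	 * on HW_ACC_MIG_PF_CTRL devices the migration registers sit in the
+	 * PF BAR2 and the full 64KB VF BAR2 is exposed.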
+ */ + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + return (pci_resource_len(vdev->pdev, index) >> 1); + /* + * On the new HW device, the migration control register + * has been moved to the PF device BAR2 region. + * The VF device BAR2 is entirely functional register space. + */ + return pci_resource_len(vdev->pdev, index); +} + static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, size_t count, loff_t *ppos, size_t *new_count) @@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, if (index == VFIO_PCI_BAR2_REGION_INDEX) { loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); /* Check if access is for migration control region */ if (pos >= end) return -EINVAL; @@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev, index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); if (index == VFIO_PCI_BAR2_REGION_INDEX) { u64 req_len, pgoff, req_start; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); req_len = vma->vm_end - vma->vm_start; pgoff = vma->vm_pgoff & ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); @@ -1324,43 +1385,23 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, return vfio_pci_core_read(core_vdev, buf, new_count, ppos); } -static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); - if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + if (info->index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - /* - * ACC VF dev BAR2 region consists of both functional - * register space and migration control register space. - * Report only the functional region to Guest. - */ - info.size = pci_resource_len(pdev, info.index) / 2; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP; + info->size = hisi_acc_get_resource_len(vdev, info->index); - return copy_to_user((void __user *)arg, &info, minsz) ? 
- -EFAULT : 0; - } - } - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1521,7 +1562,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) hisi_acc_vf_disable_fds(hisi_acc_vdev); mutex_lock(&hisi_acc_vdev->open_mutex); hisi_acc_vdev->dev_opened = false; - iounmap(vf_qm->io_base); + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + iounmap(vf_qm->io_base); mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } @@ -1557,13 +1599,15 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, - .ioctl = hisi_acc_vfio_pci_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -1577,6 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 91002ceeebc1..cd55eba64dfb 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -50,8 +50,10 @@ #define QM_QUE_ISO_CFG_V 0x0030 #define QM_PAGE_SIZE 0x0034 -#define QM_EQC_DW0 0X8000 -#define QM_AEQC_DW0 0X8020 +#define QM_EQC_VF_DW0 0X8000 +#define QM_AEQC_VF_DW0 0X8020 +#define QM_EQC_PF_DW0 0x1c00 +#define QM_AEQC_PF_DW0 0x1c20 #define ACC_DRV_MAJOR_VER 1 #define ACC_DRV_MINOR_VER 0 @@ -59,6 +61,22 @@ #define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC #define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE +#define QM_MIG_REGION_OFFSET 0x180000 +#define QM_MIG_REGION_SIZE 0x2000 + +/** + * On HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the latter 32KB of the VF's BAR2. + * The Guest is only provided with the first 32KB of the VF's BAR2. + * On HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the PF's BAR2, and the entire 64KB + * of the VF's BAR2 is allocated to the Guest. + */ +enum hw_drv_mode { + HW_ACC_MIG_VF_CTRL = 0, + HW_ACC_MIG_PF_CTRL, +}; + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ @@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + enum hw_drv_mode drv_mode; /* * vf_qm_state represents the QM_VF_STATE register value. 
* It is set by Guest driver for the ACC VF dev indicating diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ec47e736a8e..9c5970411d07 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e346392b72f6..84d142a47ec6 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,8 @@ #include <linux/vfio_pci_core.h> #include <linux/delay.h> #include <linux/jiffies.h> +#include <linux/pci-p2pdma.h> +#include <linux/pm_runtime.h> /* * The device memory usable to the workloads running in the VM is cached @@ -58,6 +60,8 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug; + /* GPU has just been reset */ + bool reset_done; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -102,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) mutex_init(&nvdev->remap_lock); } + /* + * GPU readiness is checked by reading the BAR0 registers. + * + * ioremap BAR0 to ensure that the BAR0 mapping is present before + * register reads on first fault before establishing any GPU + * memory mapping. + */ + ret = vfio_pci_core_setup_barmap(vdev, 0); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + vfio_pci_core_finish_enable(vdev); return 0; @@ -130,6 +147,106 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static int nvgrace_gpu_wait_device_ready(void __iomem *io) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) + return 0; + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + + return -ETIME; +} + +/* + * If the GPU memory is accessed by the CPU while the GPU is not ready + * after reset, it can cause harmless corrected RAS events to be logged. + * Make sure the GPU is ready before establishing the mappings. 
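+	 * The first access after a reset therefore polls C2C_LINK_BAR0_OFFSET
+	 * and HBM_TRAINING_BAR0_OFFSET for STATUS_READY before any mapping,
+	 * read or write proceeds; later accesses skip the poll.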
+ */ +static int +nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; + + lockdep_assert_held_read(&vdev->memory_lock); + + if (!nvdev->reset_done) + return 0; + + if (!__vfio_pci_memory_enabled(vdev)) + return -EIO; + + ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]); + if (ret) + return ret; + + nvdev->reset_done = false; + + return 0; +} + +static unsigned long addr_to_pgoff(struct vm_area_struct *vma, + unsigned long addr) +{ + u64 pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = &nvdev->core_device; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + vm_fault_t ret = VM_FAULT_FALLBACK; + struct mem_region *memregion; + unsigned long pfn, addr; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return VM_FAULT_SIGBUS; + + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) { + if (vdev->pm_runtime_engaged || + nvgrace_gpu_check_device_ready(nvdev)) + return VM_FAULT_SIGBUS; + + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } + } + + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s order = %d pfn 0x%lx: 0x%x\n", + __func__, order, pfn, + (unsigned int)ret); + + return ret; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, +#endif +}; + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { @@ -137,10 +254,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); struct mem_region *memregion; - unsigned long start_pfn; u64 req_len, pgoff, end; unsigned int index; - int ret = 0; index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); @@ -157,17 +272,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || - check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) return -EOVERFLOW; /* - * Check that the mapping request does not go beyond available device - * memory size + * Check that the mapping request does not go beyond the exposed + * device memory size. */ if (end > memregion->memlength) return -EINVAL; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + /* * The carved out region of the device memory needs the NORMAL_NC * property. Communicate as such to the hypervisor. @@ -184,56 +300,31 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - /* - * Perform a PFN map to the memory and back the device BAR by the - * GPU memory. 
- * - * The available GPU memory size may not be power-of-2 aligned. The - * remainder is only backed by vfio_device_ops read/write handlers. - * - * During device reset, the GPU is safely disconnected to the CPU - * and access to the BAR will be immediately returned preventing - * machine check. - */ - ret = remap_pfn_range(vma, vma->vm_start, start_pfn, - req_len, vma->vm_page_prot); - if (ret) - return ret; - - vma->vm_pgoff = start_pfn; + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + vma->vm_private_data = nvdev; return 0; } -static long -nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned long arg) +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse; - struct vfio_region_info info; struct mem_region *memregion; u32 size; int ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - /* * Request to determine the BAR region information. Send the * GPU memory information. */ - memregion = nvgrace_gpu_memregion(info.index, nvdev); + memregion = nvgrace_gpu_memregion(info->index, nvdev); if (!memregion) - return vfio_pci_core_ioctl(core_vdev, - VFIO_DEVICE_GET_REGION_INFO, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); size = struct_size(sparse, areas, 1); @@ -252,49 +343,28 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; sparse->header.version = 1; - ret = vfio_info_add_capability(&caps, &sparse->header, size); + ret = vfio_info_add_capability(caps, &sparse->header, size); kfree(sparse); if (ret) return ret; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); /* * The region memory size may not be power-of-2 aligned. * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. */ - info.size = memregion->bar_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | + info->size = memregion->bar_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - kfree(caps.buf); - } - return copy_to_user((void __user *)arg, &info, minsz) ? 
- -EFAULT : 0; + return 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); case VFIO_DEVICE_IOEVENTFD: return -ENOTTY; case VFIO_DEVICE_RESET: @@ -510,6 +580,7 @@ static ssize_t nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); struct mem_region *memregion; @@ -536,9 +607,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, else mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } /* * Only the device memory present on the hardware is mapped, which may @@ -563,9 +640,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); @@ -627,6 +711,7 @@ static ssize_t nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, size_t count, loff_t *ppos, const char __user *buf) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; struct mem_region *memregion; @@ -656,9 +741,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, */ mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } exitfn: *ppos += count; @@ -672,10 +763,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); @@ -683,6 +781,50 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, return vfio_pci_core_write(core_vdev, buf, count, ppos); 
} +static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct nvgrace_gpu_pci_core_device *nvdev = container_of( + core_vdev, struct nvgrace_gpu_pci_core_device, core_device); + struct pci_dev *pdev = core_vdev->pdev; + struct mem_region *mem_region; + + /* + * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) { + * The P2P properties of the non-BAR memory is the same as the + * BAR memory, so just use the provider for index 0. Someday + * when CXL gets P2P support we could create CXLish providers + * for the non-BAR memory. + * } else if (region_index == USEMEM_REGION_INDEX) { + * This is actually cachable memory and isn't treated as P2P in + * the chip. For now we have no way to push cachable memory + * through everything and the Grace HW doesn't care what caching + * attribute is programmed into the SMMU. So use BAR 0. + * } + */ + mem_region = nvgrace_gpu_memregion(region_index, nvdev); + if (mem_region) { + *provider = pcim_p2pdma_provider(pdev, 0); + if (!*provider) + return -EINVAL; + return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges, + nr_ranges, + mem_region->memphys, + mem_region->memlength); + } + + return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index, + phys_vec, dma_ranges, nr_ranges); +} + +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = { + .get_dmabuf_phys = nvgrace_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .name = "nvgrace-gpu-vfio-pci", .init = vfio_pci_core_init_dev, @@ -690,6 +832,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, + .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, @@ -703,6 +846,10 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .name = "nvgrace-gpu-vfio-pci-core", .init = vfio_pci_core_init_dev, @@ -710,6 +857,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -893,11 +1041,10 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) * Ensure that the BAR0 region is enabled before accessing the * registers. 
*/ -static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev) { - unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); void __iomem *io; - int ret = -ETIME; + int ret; ret = pci_enable_device(pdev); if (ret) @@ -913,16 +1060,8 @@ static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) goto iomap_exit; } - do { - if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && - (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { - ret = 0; - goto reg_check_exit; - } - msleep(POLL_QUANTUM_MS); - } while (!time_after(jiffies, timeout)); + ret = nvgrace_gpu_wait_device_ready(io); -reg_check_exit: pci_iounmap(pdev, io); iomap_exit: pci_release_selected_regions(pdev, 1 << 0); @@ -939,7 +1078,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; - ret = nvgrace_gpu_wait_device_ready(pdev); + ret = nvgrace_gpu_probe_check_device_ready(pdev); if (ret) return ret; @@ -965,6 +1104,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, memphys, memlength); if (ret) goto out_put_vdev; + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops; + } else { + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops; } ret = vfio_pci_core_register_device(&nvdev->core_device); @@ -1002,12 +1144,38 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); +/* + * The GPU reset is required to be serialized against the *first* mapping + * faults and read/writes accesses to prevent potential RAS events logging. + * + * First fault or access after a reset needs to poll device readiness, + * flag that a reset has occurred. The readiness test is done by holding + * the memory_lock read lock and we expect all vfio-pci initiated resets to + * hold the memory_lock write lock to avoid races. However, .reset_done + * extends beyond the scope of vfio-pci initiated resets therefore we + * cannot assert this behavior and use lockdep_assert_held_write. 
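For reference, the readiness test invoked from the read/write and fault paths above, nvgrace_gpu_check_device_ready(), is defined in a hunk outside this excerpt. A minimal sketch of its expected shape, assuming BAR0 is kept mapped (e.g. via vfio_pci_core_setup_barmap()) and reusing the nvgrace_gpu_wait_device_ready() poll helper factored out above together with the reset_done flag set by the .reset_done handler added further down; the actual body may differ:

/*
 * Illustrative sketch only; the real implementation lives outside this
 * excerpt.  Callers (the read/write and fault paths above) hold
 * memory_lock for read.
 */
static int nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
{
        struct vfio_pci_core_device *vdev = &nvdev->core_device;
        void __iomem *io;
        int ret;

        lockdep_assert_held_read(&vdev->memory_lock);

        /* Fast path: no reset observed since the last successful check. */
        if (!nvdev->reset_done)
                return 0;

        /* Assumes BAR0 was mapped at open time. */
        io = vdev->barmap[0];
        if (!io)
                return -EIO;

        /* Re-run the C2C link / HBM training poll used at probe time. */
        ret = nvgrace_gpu_wait_device_ready(io);
        if (!ret)
                nvdev->reset_done = false;

        return ret;
}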
+ */ +static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + + nvdev->reset_done = true; +} + +static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = { + .reset_done = nvgrace_gpu_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + static struct pci_driver nvgrace_gpu_vfio_pci_driver = { .name = KBUILD_MODNAME, .id_table = nvgrace_gpu_vfio_pci_table, .probe = nvgrace_gpu_probe, .remove = nvgrace_gpu_remove, - .err_handler = &vfio_pci_core_err_handlers, + .err_handler = &nvgrace_gpu_vfio_pci_err_handlers, .driver_managed_dma = true, }; diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f3ccb0008f67..be103c74e969 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,6 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index a19b68043eb2..8fbdf7c6d666 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,6 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2..0c771064c0b8 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,6 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -147,6 +148,10 @@ static const struct vfio_device_ops vfio_pci_ops = { .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; +static const struct vfio_pci_device_ops vfio_pci_dev_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_core_device *vdev; @@ -161,6 +166,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); + vdev->pci_ops = &vfio_pci_dev_ops; ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8f02f236b5b4..dc4e510e6e1b 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) return pdev->current_state < PCI_D3hot && (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY)); } +EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled); /* * Restore the *real* BARs after we detect a FLR or backdoor reset. 
@@ -589,10 +590,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); - if (!new_mem) + if (!new_mem) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } /* * If the user is writing mem/io enable (new_mem/io) and we @@ -627,6 +630,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, *virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -707,12 +712,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) { - if (state >= PCI_D3hot) + if (state >= PCI_D3hot) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } vfio_pci_set_power_state(vdev, state); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -900,7 +909,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } @@ -982,7 +994,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..3a11e6f450f7 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,6 +28,7 @@ #include <linux/nospec.h> #include <linux/sched/mm.h> #include <linux/iommufd.h> +#include <linux/pci-p2pdma.h> #if IS_ENABLED(CONFIG_EEH) #include <asm/eeh.h> #endif @@ -41,6 +42,40 @@ static bool nointxmask; static bool disable_vga; static bool disable_idle_d3; +static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu) +{ + struct vfio_pci_eventfd *eventfd = + container_of(rcu, struct vfio_pci_eventfd, rcu); + + eventfd_ctx_put(eventfd->ctx); + kfree(eventfd); +} + +int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, + struct eventfd_ctx *ctx) +{ + struct vfio_pci_eventfd *new = NULL; + struct vfio_pci_eventfd *old; + + lockdep_assert_held(&vdev->igate); + + if (ctx) { + new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->ctx = ctx; + } + + old = rcu_replace_pointer(*peventfd, new, + lockdep_is_held(&vdev->igate)); + if (old) + call_rcu(&old->rcu, vfio_pci_eventfd_rcu_free); + + return 0; +} + /* List of PF's that vfio_pci_core_sriov_configure() has been called on */ static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex); static LIST_HEAD(vfio_pci_sriov_pfs); @@ -286,6 +321,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, * semaphore. 
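The command-register, power-state and FLR hunks above all wrap the same bracket around paths that can disable or reset device memory. Distilled into one place (example_memory_disabling_op() is only an illustrative name, not a new call site in this series), the pattern is:

/*
 * Sketch of the recurring pattern: revoke exported DMA-BUFs before device
 * memory can go away, restore them only if memory decoding survived the
 * operation.  memory_lock is held for write throughout, which is also what
 * vfio_pci_dma_buf_move() asserts.
 */
static void example_memory_disabling_op(struct vfio_pci_core_device *vdev)
{
        /* Zap userspace mappings and take memory_lock for write. */
        vfio_pci_zap_and_down_write_memory_lock(vdev);
        vfio_pci_dma_buf_move(vdev, true);      /* revoke importers */

        pci_try_reset_function(vdev->pdev);     /* or D3hot entry, FLR, ... */

        /* Re-arm the DMA-BUFs only if memory is still enabled. */
        if (__vfio_pci_memory_enabled(vdev))
                vfio_pci_dma_buf_move(vdev, false);
        up_write(&vdev->memory_lock);
}

Revoking before the operation keeps importers from touching a BAR that is about to vanish; the restore is conditional on __vfio_pci_memory_enabled() because the operation itself may leave memory decoding off.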
*/ vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); + if (vdev->pm_runtime_engaged) { up_write(&vdev->memory_lock); return -EINVAL; @@ -299,11 +336,9 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_entry(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -320,12 +355,10 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, } static int vfio_pci_core_pm_entry_with_wakeup( - struct vfio_device *device, u32 flags, + struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_low_power_entry_with_wakeup __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); struct vfio_device_low_power_entry_with_wakeup entry; struct eventfd_ctx *efdctx; int ret; @@ -373,14 +406,14 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) */ down_write(&vdev->memory_lock); __vfio_pci_runtime_pm_exit(vdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } -static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_exit(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -695,15 +728,11 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #endif vfio_pci_core_disable(vdev); + vfio_pci_dma_buf_cleanup(vdev); + mutex_lock(&vdev->igate); - if (vdev->err_trigger) { - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = NULL; - } - if (vdev->req_trigger) { - eventfd_ctx_put(vdev->req_trigger); - vdev->req_trigger = NULL; - } + vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL); + vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL); mutex_unlock(&vdev->igate); } EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); @@ -996,42 +1025,36 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; } -static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - struct vfio_region_info __user *arg) +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info->index]) { + info->flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info->index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, caps); if (ret) return ret; } @@ -1043,9 +1066,9 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, size_t size; u16 cmd; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - info.size = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = 0; + info->size = 0; if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { /* @@ -1055,16 +1078,17 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, cmd = vfio_pci_memory_lock_and_enable(vdev); io = pci_map_rom(pdev, &size); if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report the BAR size, not the ROM size. */ - info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + info->size = pci_resource_len(pdev, + PCI_ROM_RESOURCE); pci_unmap_rom(pdev, io); } vfio_pci_memory_unlock_and_restore(vdev, cmd); } else if (pdev->rom && pdev->romlen) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report BAR size as power of two. 
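With the argsz handling and capability-chain marshalling stripped out of the PCI implementation in this hunk, the common VFIO_DEVICE_GET_REGION_INFO path is expected to do that work before and after calling the new .get_region_info_caps op. The helper below is a sketch reconstructed from the code being removed here, not the actual core implementation (example_get_region_info() is an illustrative name):

/* Sketch of the common caller assumed to live in the VFIO core. */
static int example_get_region_info(struct vfio_device *device,
                                   struct vfio_region_info __user *arg)
{
        unsigned long minsz = offsetofend(struct vfio_region_info, offset);
        struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
        struct vfio_region_info info;
        int ret;

        if (copy_from_user(&info, arg, minsz))
                return -EFAULT;
        if (info.argsz < minsz)
                return -EINVAL;

        /* Driver fills info and optionally appends capability chain entries. */
        ret = device->ops->get_region_info_caps(device, &info, &caps);
        if (ret)
                goto out;

        if (caps.size) {
                info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
                if (info.argsz < sizeof(info) + caps.size) {
                        info.argsz = sizeof(info) + caps.size;
                        info.cap_offset = 0;
                } else {
                        vfio_info_cap_shift(&caps, sizeof(info));
                        if (copy_to_user(arg + 1, caps.buf, caps.size)) {
                                ret = -EFAULT;
                                goto out;
                        }
                        info.cap_offset = sizeof(*arg);
                }
        }

        ret = copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
out:
        kfree(caps.buf);
        return ret;
}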
*/ - info.size = roundup_pow_of_two(pdev->romlen); + info->size = roundup_pow_of_two(pdev->romlen); } break; @@ -1073,10 +1097,10 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, if (!vdev->has_vga) return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0xc0000; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; default: { @@ -1085,53 +1109,36 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vdev->region[i].size; + info->flags = vdev->region[i].flags; cap_type.type = vdev->region[i].type; cap_type.subtype = vdev->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; if (vdev->region[i].ops->add_capability) { ret = vdev->region[i].ops->add_capability( - vdev, &vdev->region[i], &caps); + vdev, &vdev->region[i], caps); if (ret) return ret; } } } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(*arg); - } - - kfree(caps.buf); - } - - return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; + return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, struct vfio_irq_info __user *arg) @@ -1227,7 +1234,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, */ vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_dma_buf_move(vdev, true); ret = pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); return ret; @@ -1457,8 +1467,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: @@ -1473,11 +1481,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); -static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - uuid_t __user *arg, size_t argsz) +static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev, + u32 flags, uuid_t __user *arg, + size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); uuid_t uuid; int ret; @@ -1504,16 +1511,21 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + switch (flags & VFIO_DEVICE_FEATURE_MASK) { case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: - return vfio_pci_core_pm_entry(device, flags, arg, argsz); + return vfio_pci_core_pm_entry(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: - return vfio_pci_core_pm_entry_with_wakeup(device, flags, + return vfio_pci_core_pm_entry_with_wakeup(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: - return vfio_pci_core_pm_exit(device, flags, arg, argsz); + return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: - return vfio_pci_core_feature_token(device, flags, arg, argsz); + return vfio_pci_core_feature_token(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF: + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -1640,49 +1652,49 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma) return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } -static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, - unsigned int order) +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, + unsigned long pfn, + unsigned int order) { - struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_core_device *vdev = vma->vm_private_data; - unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); - unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - unsigned long pfn = vma_to_pfn(vma) + pgoff; - vm_fault_t ret = VM_FAULT_SIGBUS; - - if (order && (addr < vma->vm_start || - addr + (PAGE_SIZE << order) > vma->vm_end || - pfn & ((1 << order) - 1))) { - ret = VM_FAULT_FALLBACK; - goto out; - } - - down_read(&vdev->memory_lock); + lockdep_assert_held_read(&vdev->memory_lock); if 
(vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) - goto out_unlock; + return VM_FAULT_SIGBUS; switch (order) { case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn); - break; + return vmf_insert_pfn(vmf->vma, vmf->address, pfn); #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, pfn, false); - break; + return vmf_insert_pfn_pmd(vmf, pfn, false); #endif #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, pfn, false); + return vmf_insert_pfn_pud(vmf, pfn, false); break; #endif default: - ret = VM_FAULT_FALLBACK; + return VM_FAULT_FALLBACK; + } +} +EXPORT_SYMBOL_GPL(vfio_pci_vmf_insert_pfn); + +static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_core_device *vdev = vma->vm_private_data; + unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); + unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + unsigned long pfn = vma_to_pfn(vma) + pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); } -out_unlock: - up_read(&vdev->memory_lock); -out: dev_dbg_ratelimited(&vdev->pdev->dev, "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", __func__, order, @@ -1749,18 +1761,9 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. */ - if (!vdev->barmap[index]) { - ret = pci_request_selected_regions(pdev, - 1 << index, "vfio-pci"); - if (ret) - return ret; - - vdev->barmap[index] = pci_iomap(pdev, index, 0); - if (!vdev->barmap[index]) { - pci_release_selected_regions(pdev, 1 << index); - return -ENOMEM; - } - } + ret = vfio_pci_core_setup_barmap(vdev, index); + if (ret) + return ret; vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); @@ -1800,21 +1803,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_eventfd *eventfd; - mutex_lock(&vdev->igate); - - if (vdev->req_trigger) { + rcu_read_lock(); + eventfd = rcu_dereference(vdev->req_trigger); + if (eventfd) { if (!(count % 10)) pci_notice_ratelimited(pdev, "Relaying device request to user (#%u)\n", count); - eventfd_signal(vdev->req_trigger); + eventfd_signal(eventfd->ctx); } else if (count == 0) { pci_warn(pdev, "No device request channel registered, blocked until released by user\n"); } - - mutex_unlock(&vdev->igate); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(vfio_pci_core_request); @@ -2085,6 +2088,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); + int ret; vdev->pdev = to_pci_dev(core_vdev->dev); vdev->irq_type = VFIO_PCI_NUM_IRQS; @@ -2094,6 +2098,10 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); + ret = pcim_p2pdma_init(vdev->pdev); + if (ret && ret != -EOPNOTSUPP) + return ret; + INIT_LIST_HEAD(&vdev->dmabufs); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2227,13 +2235,13 @@ pci_ers_result_t 
vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); + struct vfio_pci_eventfd *eventfd; - mutex_lock(&vdev->igate); - - if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger); - - mutex_unlock(&vdev->igate); + rcu_read_lock(); + eventfd = rcu_dereference(vdev->err_trigger); + if (eventfd) + eventfd_signal(eventfd->ctx); + rcu_read_unlock(); return PCI_ERS_RESULT_CAN_RECOVER; } @@ -2458,6 +2466,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, break; } + vfio_pci_dma_buf_move(vdev, true); vfio_pci_zap_bars(vdev); } @@ -2486,8 +2495,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, err_undo: list_for_each_entry_from_reverse(vdev, &dev_set->device_list, - vdev.dev_set_list) + vdev.dev_set_list) { + if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); + } list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) pm_runtime_put(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c new file mode 100644 index 000000000000..d4d0f7d08c53 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + */ +#include <linux/dma-buf-mapping.h> +#include <linux/pci-p2pdma.h> +#include <linux/dma-resv.h> + +#include "vfio_pci_priv.h" + +MODULE_IMPORT_NS("DMA_BUF"); + +struct vfio_pci_dma_buf { + struct dma_buf *dmabuf; + struct vfio_pci_core_device *vdev; + struct list_head dmabufs_elm; + size_t size; + struct dma_buf_phys_vec *phys_vec; + struct p2pdma_provider *provider; + u32 nr_ranges; + u8 revoked : 1; +}; + +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + return 0; +} + +static struct sg_table * +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + return dma_buf_phys_vec_to_sgt(attachment, priv->provider, + priv->phys_vec, priv->nr_ranges, + priv->size, dir); +} + +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + dma_buf_free_sgt(attachment, sgt, dir); +} + +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + /* + * Either this or vfio_pci_dma_buf_cleanup() will remove from the list. + * The refcount prevents both. + */ + if (priv->vdev) { + down_write(&priv->vdev->memory_lock); + list_del_init(&priv->dmabufs_elm); + up_write(&priv->vdev->memory_lock); + vfio_device_put_registration(&priv->vdev->vdev); + } + kfree(priv->phys_vec); + kfree(priv); +} + +static const struct dma_buf_ops vfio_pci_dmabuf_ops = { + .attach = vfio_pci_dma_buf_attach, + .map_dma_buf = vfio_pci_dma_buf_map, + .unmap_dma_buf = vfio_pci_dma_buf_unmap, + .release = vfio_pci_dma_buf_release, +}; + +/* + * This is a temporary "private interconnect" between VFIO DMABUF and iommufd. + * It allows the two co-operating drivers to exchange the physical address of + * the BAR. 
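On the importer side, the revoke semantics described in this comment map onto an ordinary dynamic DMA-BUF attachment whose move_notify() treats every notification as a revoke. A minimal in-kernel consumer might look like the sketch below (example_* names are illustrative, not part of this series; an importer module also needs MODULE_IMPORT_NS("DMA_BUF")):

#include <linux/dma-buf.h>
#include <linux/dma-direction.h>
#include <linux/dma-resv.h>

static void example_move_notify(struct dma_buf_attachment *attach)
{
        /* The exporter above only calls this to revoke; drop cached mappings. */
}

static const struct dma_buf_attach_ops example_attach_ops = {
        .allow_peer2peer = true,        /* vfio_pci_dma_buf_attach() requires it */
        .move_notify = example_move_notify,
};

static int example_import(struct device *dev, int dmabuf_fd)
{
        struct dma_buf_attachment *attach;
        struct dma_buf *dmabuf;
        struct sg_table *sgt;
        int ret = 0;

        dmabuf = dma_buf_get(dmabuf_fd);
        if (IS_ERR(dmabuf))
                return PTR_ERR(dmabuf);

        attach = dma_buf_dynamic_attach(dmabuf, dev, &example_attach_ops, NULL);
        if (IS_ERR(attach)) {
                ret = PTR_ERR(attach);
                goto out_put;
        }

        dma_resv_lock(dmabuf->resv, NULL);
        sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
        if (IS_ERR(sgt))
                ret = PTR_ERR(sgt);     /* -ENODEV here means currently revoked */
        else
                dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
        dma_resv_unlock(dmabuf->resv);

        dma_buf_detach(dmabuf, attach);
out_put:
        dma_buf_put(dmabuf);
        return ret;
}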
This is to be replaced with a formal DMABUF system for negotiated + * interconnect types. + * + * If this function succeeds the following are true: + * - There is one physical range and it is pointing to MMIO + * - When move_notify is called it means revoke, not move, vfio_dma_buf_map + * will fail if it is currently revoked + */ +int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + struct vfio_pci_dma_buf *priv; + + dma_resv_assert_held(attachment->dmabuf->resv); + + if (attachment->dmabuf->ops != &vfio_pci_dmabuf_ops) + return -EOPNOTSUPP; + + priv = attachment->dmabuf->priv; + if (priv->revoked) + return -ENODEV; + + /* More than one range to iommufd will require proper DMABUF support */ + if (priv->nr_ranges != 1) + return -EOPNOTSUPP; + + *phys = priv->phys_vec[0]; + return 0; +} +EXPORT_SYMBOL_FOR_MODULES(vfio_pci_dma_buf_iommufd_map, "iommufd"); + +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + phys_addr_t max_addr; + unsigned int i; + + max_addr = start + len; + for (i = 0; i < nr_ranges; i++) { + phys_addr_t end; + + if (!dma_ranges[i].length) + return -EINVAL; + + if (check_add_overflow(start, dma_ranges[i].offset, + &phys_vec[i].paddr) || + check_add_overflow(phys_vec[i].paddr, + dma_ranges[i].length, &end)) + return -EOVERFLOW; + if (end > max_addr) + return -EINVAL; + + phys_vec[i].len = dma_ranges[i].length; + } + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec); + +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct pci_dev *pdev = vdev->pdev; + + *provider = pcim_p2pdma_provider(pdev, region_index); + if (!*provider) + return -EINVAL; + + return vfio_pci_core_fill_phys_vec( + phys_vec, dma_ranges, nr_ranges, + pci_resource_start(pdev, region_index), + pci_resource_len(pdev, region_index)); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys); + +static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf, + struct vfio_region_dma_range *dma_ranges, + size_t *lengthp) +{ + size_t length = 0; + u32 i; + + for (i = 0; i < dma_buf->nr_ranges; i++) { + u64 offset = dma_ranges[i].offset; + u64 len = dma_ranges[i].length; + + if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (check_add_overflow(length, len, &length)) + return -EINVAL; + } + + /* + * dma_iova_try_alloc() will WARN on if userspace proposes a size that + * is too big, eg with lots of ranges. 
+ */ + if ((u64)(length) & DMA_IOVA_USE_SWIOTLB) + return -EINVAL; + + *lengthp = length; + return 0; +} + +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf get_dma_buf = {}; + struct vfio_region_dma_range *dma_ranges; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct vfio_pci_dma_buf *priv; + size_t length; + int ret; + + if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(get_dma_buf)); + if (ret != 1) + return ret; + + if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf))) + return -EFAULT; + + if (!get_dma_buf.nr_ranges || get_dma_buf.flags) + return -EINVAL; + + /* + * For PCI the region_index is the BAR number like everything else. + */ + if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX) + return -ENODEV; + + dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, + sizeof(*dma_ranges)); + if (IS_ERR(dma_ranges)) + return PTR_ERR(dma_ranges); + + ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length); + if (ret) + goto err_free_ranges; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_free_ranges; + } + priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec), + GFP_KERNEL); + if (!priv->phys_vec) { + ret = -ENOMEM; + goto err_free_priv; + } + + priv->vdev = vdev; + priv->nr_ranges = get_dma_buf.nr_ranges; + priv->size = length; + ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider, + get_dma_buf.region_index, + priv->phys_vec, dma_ranges, + priv->nr_ranges); + if (ret) + goto err_free_phys; + + kfree(dma_ranges); + dma_ranges = NULL; + + if (!vfio_device_try_get_registration(&vdev->vdev)) { + ret = -ENODEV; + goto err_free_phys; + } + + exp_info.ops = &vfio_pci_dmabuf_ops; + exp_info.size = priv->size; + exp_info.flags = get_dma_buf.open_flags; + exp_info.priv = priv; + + priv->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(priv->dmabuf)) { + ret = PTR_ERR(priv->dmabuf); + goto err_dev_put; + } + + /* dma_buf_put() now frees priv */ + INIT_LIST_HEAD(&priv->dmabufs_elm); + down_write(&vdev->memory_lock); + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = !__vfio_pci_memory_enabled(vdev); + list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); + dma_resv_unlock(priv->dmabuf->resv); + up_write(&vdev->memory_lock); + + /* + * dma_buf_fd() consumes the reference, when the file closes the dmabuf + * will be released. 
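From userspace, the feature is exercised through the usual VFIO_DEVICE_FEATURE container. A sketch, assuming the uAPI layout matches the fields used above (region_index, open_flags, flags, nr_ranges and a trailing dma_ranges[] array of offset/length pairs):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Request a DMA-BUF fd covering one page-aligned range of BAR 0. */
static int get_bar0_dmabuf(int device_fd, __u64 offset, __u64 length)
{
        char buf[sizeof(struct vfio_device_feature) +
                 sizeof(struct vfio_device_feature_dma_buf) +
                 sizeof(struct vfio_region_dma_range)];
        struct vfio_device_feature *feature = (void *)buf;
        struct vfio_device_feature_dma_buf *get_dma_buf = (void *)feature->data;

        memset(buf, 0, sizeof(buf));            /* .flags must stay zero */
        feature->argsz = sizeof(buf);
        feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF;

        get_dma_buf->region_index = 0;          /* BAR 0 */
        get_dma_buf->open_flags = O_CLOEXEC;
        get_dma_buf->nr_ranges = 1;
        get_dma_buf->dma_ranges[0].offset = offset;     /* page aligned */
        get_dma_buf->dma_ranges[0].length = length;     /* page aligned, != 0 */

        /* On success the ioctl returns the fd produced by dma_buf_fd() above. */
        return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}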
+ */ + ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); + if (ret < 0) + goto err_dma_buf; + return ret; + +err_dma_buf: + dma_buf_put(priv->dmabuf); +err_dev_put: + vfio_device_put_registration(&vdev->vdev); +err_free_phys: + kfree(priv->phys_vec); +err_free_priv: + kfree(priv); +err_free_ranges: + kfree(dma_ranges); + return ret; +} + +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + lockdep_assert_held_write(&vdev->memory_lock); + + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + if (priv->revoked != revoked) { + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + } + fput(priv->dmabuf->file); + } +} + +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + down_write(&vdev->memory_lock); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + dma_resv_lock(priv->dmabuf->resv, NULL); + list_del_init(&priv->dmabufs_elm); + priv->vdev = NULL; + priv->revoked = true; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + vfio_device_put_registration(&vdev->vdev); + fput(priv->dmabuf->file); + } + up_write(&vdev->memory_lock); +} diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 30d3e921cb0d..c76e753b3cec 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, +static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, unsigned int count, uint32_t flags, void *data) { /* DATA_NONE/DATA_BOOL enables loopback testing */ if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (*ctx) { - if (count) { - eventfd_signal(*ctx); - } else { - eventfd_ctx_put(*ctx); - *ctx = NULL; - } + struct vfio_pci_eventfd *eventfd; + + eventfd = rcu_dereference_protected(*peventfd, + lockdep_is_held(&vdev->igate)); + + if (!eventfd) + return -EINVAL; + + if (count) { + eventfd_signal(eventfd->ctx); return 0; } + + return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t trigger; @@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, return -EINVAL; trigger = *(uint8_t *)data; - if (trigger && *ctx) - eventfd_signal(*ctx); + + if (trigger) { + struct vfio_pci_eventfd *eventfd = + rcu_dereference_protected(*peventfd, + lockdep_is_held(&vdev->igate)); + + if (eventfd) + eventfd_signal(eventfd->ctx); + } return 0; } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { @@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, fd = *(int32_t *)data; if (fd == -1) { - if (*ctx) - eventfd_ctx_put(*ctx); - *ctx = NULL; + return vfio_pci_eventfd_replace_locked(vdev, + peventfd, NULL); } else if (fd >= 0) { struct eventfd_ctx *efdctx; + int ret; efdctx = eventfd_ctx_fdget(fd); if (IS_ERR(efdctx)) return PTR_ERR(efdctx); - if (*ctx) - eventfd_ctx_put(*ctx); + ret = vfio_pci_eventfd_replace_locked(vdev, + peventfd, efdctx); + if (ret) + eventfd_ctx_put(efdctx); - *ctx = 
efdctx; + return ret; } - return 0; } return -EINVAL; @@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev, if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, + return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger, count, flags, data); } @@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, + return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger, count, flags, data); } diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index a9972eacb293..27ac280f00b9 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd { bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); +int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, + struct eventfd_ctx *ctx); + int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data); @@ -60,7 +64,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev); int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state); -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, @@ -107,4 +110,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz); +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); +#else +static inline int +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + return -ENOTTY; +} +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ +} +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, + bool revoked) +{ +} +#endif + #endif diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index c7d7e27af386..cb3d5e57d3a3 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -109,10 +109,9 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index 
832af5ba267c..1ed349a55629 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,41 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - void __user *uarg = (void __user *)arg; - struct vfio_region_info info = {}; - if (copy_from_user(&info, uarg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR0_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = virtvdev->bar0_virtual_buf_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + return 0; } static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 8084f3e36a9f..d2e5cbca13c8 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,6 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -108,7 +109,8 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .release = virtiovf_pci_core_release_dev, .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, - .ioctl = virtiovf_vfio_pci_core_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, @@ -130,6 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/xe/Kconfig b/drivers/vfio/pci/xe/Kconfig new file mode 100644 index 000000000000..cc9b6dac6ed3 --- /dev/null +++ b/drivers/vfio/pci/xe/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config XE_VFIO_PCI + tristate "VFIO support for Intel Graphics" + depends on DRM_XE && PCI_IOV + select VFIO_PCI_CORE + help + This 
option enables device specific VFIO driver variant for Intel Graphics. + In addition to generic VFIO PCI functionality, it implements VFIO + migration uAPI allowing userspace to enable migration for + Intel Graphics SR-IOV Virtual Functions supported by the Xe driver. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/xe/Makefile b/drivers/vfio/pci/xe/Makefile new file mode 100644 index 000000000000..13aa0fd192cd --- /dev/null +++ b/drivers/vfio/pci/xe/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_XE_VFIO_PCI) += xe-vfio-pci.o +xe-vfio-pci-y := main.o diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c new file mode 100644 index 000000000000..0156b53c678b --- /dev/null +++ b/drivers/vfio/pci/xe/main.c @@ -0,0 +1,573 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2025 Intel Corporation + */ + +#include <linux/anon_inodes.h> +#include <linux/delay.h> +#include <linux/file.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/sizes.h> +#include <linux/types.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_core.h> + +#include <drm/intel/xe_sriov_vfio.h> +#include <drm/intel/pciids.h> + +struct xe_vfio_pci_migration_file { + struct file *filp; + /* serializes accesses to migration data */ + struct mutex lock; + struct xe_vfio_pci_core_device *xe_vdev; + u8 disabled:1; +}; + +struct xe_vfio_pci_core_device { + struct vfio_pci_core_device core_device; + struct xe_device *xe; + /* PF internal control uses vfid index starting from 1 */ + unsigned int vfid; + u8 deferred_reset:1; + /* protects migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + /* protects the reset_done flow */ + spinlock_t reset_lock; + struct xe_vfio_pci_migration_file *migf; +}; + +#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev) + +static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + mutex_unlock(&migf->lock); +} + +static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev) +{ + xe_vfio_pci_disable_file(xe_vdev->migf); + fput(xe_vdev->migf->filp); + xe_vdev->migf = NULL; +} + +static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev) +{ + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; +} + +static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev) +{ + mutex_lock(&xe_vdev->state_mutex); +} + +/* + * This function is called in all state_mutex unlock cases to + * handle a 'deferred_reset' if exists. + */ +static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev) +{ +again: + spin_lock(&xe_vdev->reset_lock); + if (xe_vdev->deferred_reset) { + xe_vdev->deferred_reset = false; + spin_unlock(&xe_vdev->reset_lock); + xe_vfio_pci_reset(xe_vdev); + goto again; + } + mutex_unlock(&xe_vdev->state_mutex); + spin_unlock(&xe_vdev->reset_lock); +} + +static void xe_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); + int ret; + + if (!pdev->is_virtfn) + return; + + /* + * VF FLR requires additional processing done by PF driver. + * The processing is done after FLR is already finished from PCIe + * perspective. + * In order to avoid a scenario where VF is used while PF processing + * is still in progress, additional synchronization point is needed. 
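The save/resume file_operations added below only stream opaque device state; the driving sequence comes from the standard VFIO migration uAPI, which is not introduced by this diff. A userspace sketch of the save side, assuming the stock VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE definitions from <linux/vfio.h>:

#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Enter STOP_COPY and drain the returned data_fd until EOF. */
static int save_vf_state(int device_fd, int out_fd)
{
        char buf[sizeof(struct vfio_device_feature) +
                 sizeof(struct vfio_device_feature_mig_device_state)];
        struct vfio_device_feature *feature = (void *)buf;
        struct vfio_device_feature_mig_device_state *mig = (void *)feature->data;
        char data[4096];
        ssize_t n;

        memset(buf, 0, sizeof(buf));
        feature->argsz = sizeof(buf);
        feature->flags = VFIO_DEVICE_FEATURE_SET |
                         VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
        mig->device_state = VFIO_DEVICE_STATE_STOP_COPY;

        if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
                return -1;

        /* mig->data_fd is the anon inode served by xe_vfio_pci_save_read(). */
        while ((n = read(mig->data_fd, data, sizeof(data))) > 0)
                if (write(out_fd, data, n) != n)
                        return -1;

        close(mig->data_fd);
        return n < 0 ? -1 : 0;
}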
+ */ + ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid); + if (ret) + dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret); + + if (!xe_vdev->vfid) + return; + + /* + * As the higher VFIO layers are holding locks across reset and using + * those same locks with the mm_lock we need to prevent ABBA deadlock + * with the state_mutex and mm_lock. + * In case the state_mutex was taken already we defer the cleanup work + * to the unlock flow of the other running context. + */ + spin_lock(&xe_vdev->reset_lock); + xe_vdev->deferred_reset = true; + if (!mutex_trylock(&xe_vdev->state_mutex)) { + spin_unlock(&xe_vdev->reset_lock); + return; + } + spin_unlock(&xe_vdev->reset_lock); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + xe_vfio_pci_reset(xe_vdev); +} + +static const struct pci_error_handlers xe_vfio_pci_err_handlers = { + .reset_done = xe_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static int xe_vfio_pci_open_device(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &xe_vdev->core_device; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + +static void xe_vfio_pci_close_device(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + xe_vfio_pci_reset(xe_vdev); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + vfio_pci_core_close_device(core_vdev); +} + +static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + ssize_t ret; + + if (pos) + return -ESPIPE; + + mutex_lock(&migf->lock); + if (migf->disabled) { + mutex_unlock(&migf->lock); + return -ENODEV; + } + + ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); + mutex_unlock(&migf->lock); + + return ret; +} + +static const struct file_operations xe_vfio_pci_save_fops = { + .owner = THIS_MODULE, + .read = xe_vfio_pci_save_read, + .release = xe_vfio_pci_release_file, + .llseek = noop_llseek, +}; + +static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + ssize_t ret; + + if (pos) + return -ESPIPE; + + mutex_lock(&migf->lock); + if (migf->disabled) { + mutex_unlock(&migf->lock); + return -ENODEV; + } + + ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); + mutex_unlock(&migf->lock); + + return ret; +} + +static const struct file_operations xe_vfio_pci_resume_fops = { + .owner = THIS_MODULE, + .write = xe_vfio_pci_resume_write, + .release = xe_vfio_pci_release_file, + .llseek = noop_llseek, +}; + +static const char *vfio_dev_state_str(u32 state) +{ + switch (state) { + case VFIO_DEVICE_STATE_RUNNING: return "running"; + case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p"; + case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy"; + case VFIO_DEVICE_STATE_STOP: return "stop"; + case 
VFIO_DEVICE_STATE_RESUMING: return "resuming"; + case VFIO_DEVICE_STATE_ERROR: return "error"; + default: return ""; + } +} + +enum xe_vfio_pci_file_type { + XE_VFIO_FILE_SAVE = 0, + XE_VFIO_FILE_RESUME, +}; + +static struct xe_vfio_pci_migration_file * +xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev, + enum xe_vfio_pci_file_type type) +{ + struct xe_vfio_pci_migration_file *migf; + const struct file_operations *fops; + int flags; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops; + flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY; + migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags); + if (IS_ERR(migf->filp)) { + kfree(migf); + return ERR_CAST(migf->filp); + } + + mutex_init(&migf->lock); + migf->xe_vdev = xe_vdev; + xe_vdev->migf = migf; + + stream_open(migf->filp->f_inode, migf->filp); + + return migf; +} + +static struct file * +xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new) +{ + u32 cur = xe_vdev->mig_state; + int ret; + + dev_dbg(xe_vdev_to_dev(xe_vdev), + "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new)); + + /* + * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't + * have the capability to selectively block outgoing p2p DMA transfers. + * While the device is allowing BAR accesses when the VF is stopped, it + * is not processing any new workload requests, effectively stopping + * any outgoing DMA transfers (not just p2p). + * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and + * will be migrated to target VF during stop-copy. + */ + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct xe_vfio_pci_migration_file *migf; + + migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE); + if (IS_ERR(migf)) { + ret = PTR_ERR(migf); + goto err; + } + get_file(migf->filp); + + ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid); + if (ret) { + fput(migf->filp); + goto err; + } + + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { + struct xe_vfio_pci_migration_file *migf; + + migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME); + if (IS_ERR(migf)) { + ret = PTR_ERR(migf); + goto err; + } + get_file(migf->filp); + + ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid); + if (ret) { + fput(migf->filp); + goto err; + } + + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid); + if 
(ret) + goto err; + + return NULL; + } + + WARN(true, "Unknown state transition %d->%d", cur, new); + return ERR_PTR(-EINVAL); + +err: + dev_dbg(xe_vdev_to_dev(xe_vdev), + "Failed to transition state: %s->%s err=%d\n", + vfio_dev_state_str(cur), vfio_dev_state_str(new), ret); + return ERR_PTR(ret); +} + +static struct file * +xe_vfio_pci_set_device_state(struct vfio_device *core_vdev, + enum vfio_device_mig_state new_state) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + enum vfio_device_mig_state next_state; + struct file *f = NULL; + int ret; + + xe_vfio_pci_state_mutex_lock(xe_vdev); + while (new_state != xe_vdev->mig_state) { + ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state, + new_state, &next_state); + if (ret) { + xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid); + f = ERR_PTR(ret); + break; + } + f = xe_vfio_set_state(xe_vdev, next_state); + if (IS_ERR(f)) + break; + + xe_vdev->mig_state = next_state; + + /* Multiple state transitions with non-NULL file in the middle */ + if (f && new_state != xe_vdev->mig_state) { + fput(f); + f = ERR_PTR(-EINVAL); + break; + } + } + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return f; +} + +static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev, + enum vfio_device_mig_state *curr_state) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + *curr_state = xe_vdev->mig_state; + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return 0; +} + +static int xe_vfio_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return 0; +} + +static const struct vfio_migration_ops xe_vfio_pci_migration_ops = { + .migration_set_state = xe_vfio_pci_set_device_state, + .migration_get_state = xe_vfio_pci_get_device_state, + .migration_get_data_size = xe_vfio_pci_get_data_size, +}; + +static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev) +{ + struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; + struct pci_dev *pdev = to_pci_dev(core_vdev->dev); + struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); + + if (!xe) + return; + if (!xe_sriov_vfio_migration_supported(xe)) + return; + + mutex_init(&xe_vdev->state_mutex); + spin_lock_init(&xe_vdev->reset_lock); + + /* PF internal control uses vfid index starting from 1 */ + xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; + xe_vdev->xe = xe; + + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; + core_vdev->mig_ops = &xe_vfio_pci_migration_ops; +} + +static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev) +{ + if (!xe_vdev->vfid) + return; + + mutex_destroy(&xe_vdev->state_mutex); +} + +static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_migration_init(xe_vdev); + + return vfio_pci_core_init_dev(core_vdev); +} + +static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, 
core_device.vdev); + + xe_vfio_pci_migration_fini(xe_vdev); +} + +static const struct vfio_device_ops xe_vfio_pci_ops = { + .name = "xe-vfio-pci", + .init = xe_vfio_pci_init_dev, + .release = xe_vfio_pci_release_dev, + .open_device = xe_vfio_pci_open_device, + .close_device = xe_vfio_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct xe_vfio_pci_core_device *xe_vdev; + int ret; + + xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev, + &xe_vfio_pci_ops); + if (IS_ERR(xe_vdev)) + return PTR_ERR(xe_vdev); + + dev_set_drvdata(&pdev->dev, &xe_vdev->core_device); + + ret = vfio_pci_core_register_device(&xe_vdev->core_device); + if (ret) { + vfio_put_device(&xe_vdev->core_device.vdev); + return ret; + } + + return 0; +} + +static void xe_vfio_pci_remove(struct pci_dev *pdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); + + vfio_pci_core_unregister_device(&xe_vdev->core_device); + vfio_put_device(&xe_vdev->core_device.vdev); +} + +#define INTEL_PCI_VFIO_DEVICE(_id) { \ + PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \ +} + +static const struct pci_device_id xe_vfio_pci_table[] = { + INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE), + INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE), + INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE), + {} +}; +MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table); + +static struct pci_driver xe_vfio_pci_driver = { + .name = "xe-vfio-pci", + .id_table = xe_vfio_pci_table, + .probe = xe_vfio_pci_probe, + .remove = xe_vfio_pci_remove, + .err_handler = &xe_vfio_pci_err_handlers, + .driver_managed_dma = true, +}; +module_pci_driver(xe_vfio_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>"); +MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics"); diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 9f5c527baa8a..fa754f203b2d 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -115,6 +115,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 512533501eb7..a4d3ace3e02d 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,6 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c index 3bf1043cd795..c2990b7e900f 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -272,6 +272,24 @@ err_irq: } EXPORT_SYMBOL_GPL(vfio_platform_open_device); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + + if (info->index >= vdev->num_regions) + return -EINVAL; + + /* map offset to the physical address */ + info->offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); + long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -300,28 +318,6 @@ long vfio_platform_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 8d8fab516849..05084212a76e 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -85,6 +85,9 @@ int vfio_platform_open_device(struct vfio_device *core_vdev); void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60..f7df90c423b4 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device) if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } +EXPORT_SYMBOL_GPL(vfio_device_put_registration); bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } +EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); /* * VFIO driver API @@ -1259,6 +1261,51 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, } } +static long vfio_get_region_info(struct vfio_device *device, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info info = {}; + struct vfio_info_cap caps = {}; + int ret; + + if (unlikely(!device->ops->get_region_info_caps)) + return -EINVAL; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + if (info.argsz < 
minsz) + return -EINVAL; + + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } + + if (copy_to_user(arg, &info, minsz)){ + ret = -EFAULT; + goto out_free; + } + +out_free: + kfree(caps.buf); + return ret; +} + static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -1296,6 +1343,10 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_device_feature(device, uptr); break; + case VFIO_DEVICE_GET_REGION_INFO: + ret = vfio_get_region_info(device, uptr); + break; + default: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 8f7f50acb6d6..7f886d3dba7d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -69,15 +69,15 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) -static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = { - VHOST_FEATURES | - (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | - (1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VIRTIO_F_ACCESS_PLATFORM) | - (1ULL << VIRTIO_F_RING_RESET) | - (1ULL << VIRTIO_F_IN_ORDER), - VIRTIO_BIT(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) | - VIRTIO_BIT(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO), +static const int vhost_net_bits[] = { + VHOST_FEATURES, + VHOST_NET_F_VIRTIO_NET_HDR, + VIRTIO_NET_F_MRG_RXBUF, + VIRTIO_F_ACCESS_PLATFORM, + VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, + VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, + VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO }; enum { @@ -1731,7 +1731,8 @@ out: static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { - u64 all_features[VIRTIO_FEATURES_DWORDS]; + const DEFINE_VHOST_FEATURES_ARRAY(vhost_net_features, vhost_net_bits); + u64 all_features[VIRTIO_FEATURES_U64S]; struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; @@ -1763,7 +1764,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, /* Copy the net features, up to the user-provided buffer size */ argp += sizeof(u64); - copied = min(count, VIRTIO_FEATURES_DWORDS); + copied = min(count, (u64)VIRTIO_FEATURES_U64S); if (copy_to_user(argp, vhost_net_features, copied * sizeof(u64))) return -EFAULT; @@ -1778,13 +1779,13 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, virtio_features_zero(all_features); argp += sizeof(u64); - copied = min(count, VIRTIO_FEATURES_DWORDS); + copied = min(count, (u64)VIRTIO_FEATURES_U64S); if (copy_from_user(all_features, argp, copied * sizeof(u64))) return -EFAULT; /* * Any feature specified by user-space above - * VIRTIO_FEATURES_MAX is not supported by definition. + * VIRTIO_FEATURES_BITS is not supported by definition. 
*/ for (i = copied; i < count; ++i) { if (copy_from_user(&features, featurep + 1 + i, @@ -1794,7 +1795,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, return -EOPNOTSUPP; } - for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++) + for (i = 0; i < VIRTIO_FEATURES_U64S; i++) if (all_features[i] & ~vhost_net_features[i]) return -EOPNOTSUPP; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 98e4f68f4e3c..f43c1fe9fad9 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -197,11 +197,14 @@ enum { }; /* Note: can't set VIRTIO_F_VERSION_1 yet, since that implies ANY_LAYOUT. */ -enum { - VHOST_SCSI_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_SCSI_F_HOTPLUG) | - (1ULL << VIRTIO_SCSI_F_T10_PI) +static const int vhost_scsi_bits[] = { + VHOST_FEATURES, + VIRTIO_SCSI_F_HOTPLUG, + VIRTIO_SCSI_F_T10_PI }; +#define VHOST_SCSI_FEATURES VHOST_FEATURES_U64(vhost_scsi_bits, 0) + #define VHOST_SCSI_MAX_TARGET 256 #define VHOST_SCSI_MAX_IO_VQ 1024 #define VHOST_SCSI_MAX_EVENT 128 diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 42c955a5b211..1e4e36edbcd2 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -28,6 +28,12 @@ */ #define VHOST_TEST_PKT_WEIGHT 256 +static const int vhost_test_bits[] = { + VHOST_FEATURES +}; + +#define VHOST_TEST_FEATURES VHOST_FEATURES_U64(vhost_test_bits, 0) + enum { VHOST_TEST_VQ = 0, VHOST_TEST_VQ_MAX = 1, @@ -328,14 +334,14 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl, return -EFAULT; return vhost_test_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: - features = VHOST_FEATURES; + features = VHOST_TEST_FEATURES; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; - if (features & ~VHOST_FEATURES) + if (features & ~VHOST_TEST_FEATURES) return -EOPNOTSUPP; return vhost_test_set_features(n, features); case VHOST_RESET_OWNER: diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index a78226b37739..bccdc9eab267 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -804,11 +804,13 @@ static int vhost_kthread_worker_create(struct vhost_worker *worker, ret = vhost_attach_task_to_cgroups(worker); if (ret) - goto stop_worker; + goto free_id; worker->id = id; return 0; +free_id: + xa_erase(&dev->worker_xa, id); stop_worker: vhost_kthread_do_stop(worker); return ret; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index b49f08e4a1b4..4fe99765c5c7 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -14,6 +14,7 @@ #include <linux/atomic.h> #include <linux/vhost_iotlb.h> #include <linux/irqbypass.h> +#include <linux/unroll.h> struct vhost_work; struct vhost_task; @@ -287,14 +288,39 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, eventfd_signal((vq)->error_ctx);\ } while (0) -enum { - VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | - (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | - (1ULL << VIRTIO_RING_F_EVENT_IDX) | - (1ULL << VHOST_F_LOG_ALL) | - (1ULL << VIRTIO_F_ANY_LAYOUT) | - (1ULL << VIRTIO_F_VERSION_1) -}; +#define VHOST_FEATURES \ + VIRTIO_F_NOTIFY_ON_EMPTY, \ + VIRTIO_RING_F_INDIRECT_DESC, \ + VIRTIO_RING_F_EVENT_IDX, \ + VHOST_F_LOG_ALL, \ + VIRTIO_F_ANY_LAYOUT, \ + VIRTIO_F_VERSION_1 + +static inline u64 vhost_features_u64(const int *features, int size, int idx) +{ + u64 res = 0; + + unrolled_count(VIRTIO_FEATURES_BITS) + for (int i = 0; i < size; ++i) { + int bit = features[i]; + + if 
(virtio_features_chk_bit(bit) && VIRTIO_U64(bit) == idx) + res |= VIRTIO_BIT(bit); + } + return res; +} + +#define VHOST_FEATURES_U64(features, idx) \ + vhost_features_u64(features, ARRAY_SIZE(features), idx) + +#define DEFINE_VHOST_FEATURES_ARRAY_ENTRY(idx, features) \ + [idx] = VHOST_FEATURES_U64(features, idx), + +#define DEFINE_VHOST_FEATURES_ARRAY(array, features) \ + u64 array[VIRTIO_FEATURES_U64S] = { \ + UNROLL(VIRTIO_FEATURES_U64S, \ + DEFINE_VHOST_FEATURES_ARRAY_ENTRY, features) \ + } /** * vhost_vq_set_backend - Set backend. diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index ae01457ea2cd..0298ddc34824 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -29,12 +29,14 @@ */ #define VHOST_VSOCK_PKT_WEIGHT 256 -enum { - VHOST_VSOCK_FEATURES = VHOST_FEATURES | - (1ULL << VIRTIO_F_ACCESS_PLATFORM) | - (1ULL << VIRTIO_VSOCK_F_SEQPACKET) +static const int vhost_vsock_bits[] = { + VHOST_FEATURES, + VIRTIO_F_ACCESS_PLATFORM, + VIRTIO_VSOCK_F_SEQPACKET }; +#define VHOST_VSOCK_FEATURES VHOST_FEATURES_U64(vhost_vsock_bits, 0) + enum { VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index a09eb4d62f82..5bdc6b82b30b 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -53,7 +53,7 @@ static ssize_t features_show(struct device *_d, /* We actually represent this as a bitstring, as it could be * arbitrary length in future. */ - for (i = 0; i < VIRTIO_FEATURES_MAX; i++) + for (i = 0; i < VIRTIO_FEATURES_BITS; i++) len += sysfs_emit_at(buf, len, "%c", __virtio_test_bit(dev, i) ? '1' : '0'); len += sysfs_emit_at(buf, len, "\n"); @@ -272,8 +272,8 @@ static int virtio_dev_probe(struct device *_d) int err, i; struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); - u64 device_features[VIRTIO_FEATURES_DWORDS]; - u64 driver_features[VIRTIO_FEATURES_DWORDS]; + u64 device_features[VIRTIO_FEATURES_U64S]; + u64 driver_features[VIRTIO_FEATURES_U64S]; u64 driver_features_legacy; /* We have a driver! 
*/ @@ -286,7 +286,7 @@ static int virtio_dev_probe(struct device *_d) virtio_features_zero(driver_features); for (i = 0; i < drv->feature_table_size; i++) { unsigned int f = drv->feature_table[i]; - if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_MAX)) + if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_BITS)) virtio_features_set_bit(driver_features, f); } @@ -303,7 +303,7 @@ static int virtio_dev_probe(struct device *_d) } if (virtio_features_test_bit(device_features, VIRTIO_F_VERSION_1)) { - for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i) + for (i = 0; i < VIRTIO_FEATURES_U64S; ++i) dev->features_array[i] = driver_features[i] & device_features[i]; } else { @@ -325,7 +325,7 @@ static int virtio_dev_probe(struct device *_d) goto err; if (drv->validate) { - u64 features[VIRTIO_FEATURES_DWORDS]; + u64 features[VIRTIO_FEATURES_U64S]; virtio_features_copy(features, dev->features_array); err = drv->validate(dev); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1b93d8c64361..74fe59f5a78c 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -983,7 +983,8 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out_del_vqs; } vb->balloon_wq = alloc_workqueue("balloon-wq", - WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); + WQ_FREEZABLE | WQ_CPU_INTENSIVE | WQ_PERCPU, + 0); if (!vb->balloon_wq) { err = -ENOMEM; goto out_del_vqs; diff --git a/drivers/virtio/virtio_debug.c b/drivers/virtio/virtio_debug.c index d58713ddf2e5..ccf1955a1183 100644 --- a/drivers/virtio/virtio_debug.c +++ b/drivers/virtio/virtio_debug.c @@ -8,12 +8,12 @@ static struct dentry *virtio_debugfs_dir; static int virtio_debug_device_features_show(struct seq_file *s, void *data) { - u64 device_features[VIRTIO_FEATURES_DWORDS]; + u64 device_features[VIRTIO_FEATURES_U64S]; struct virtio_device *dev = s->private; unsigned int i; virtio_get_features(dev, device_features); - for (i = 0; i < VIRTIO_FEATURES_MAX; i++) { + for (i = 0; i < VIRTIO_FEATURES_BITS; i++) { if (virtio_features_test_bit(device_features, i)) seq_printf(s, "%u\n", i); } @@ -26,7 +26,7 @@ static int virtio_debug_filter_features_show(struct seq_file *s, void *data) struct virtio_device *dev = s->private; unsigned int i; - for (i = 0; i < VIRTIO_FEATURES_MAX; i++) { + for (i = 0; i < VIRTIO_FEATURES_BITS; i++) { if (virtio_features_test_bit(dev->debugfs_filter_features, i)) seq_printf(s, "%u\n", i); } @@ -50,7 +50,7 @@ static int virtio_debug_filter_feature_add(void *data, u64 val) { struct virtio_device *dev = data; - if (val >= VIRTIO_FEATURES_MAX) + if (val >= VIRTIO_FEATURES_BITS) return -EINVAL; virtio_features_set_bit(dev->debugfs_filter_features, val); @@ -64,7 +64,7 @@ static int virtio_debug_filter_feature_del(void *data, u64 val) { struct virtio_device *dev = data; - if (val >= VIRTIO_FEATURES_MAX) + if (val >= VIRTIO_FEATURES_BITS) return -EINVAL; virtio_features_clear_bit(dev->debugfs_filter_features, val); diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 9e503b7a58d8..413a8c353463 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -401,7 +401,7 @@ void vp_modern_get_extended_features(struct virtio_pci_modern_device *mdev, int i; virtio_features_zero(features); - for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) { + for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) { u64 cur; vp_iowrite32(i, &cfg->device_feature_select); @@ -427,7 +427,7 @@ vp_modern_get_driver_extended_features(struct virtio_pci_modern_device *mdev, int i; 
virtio_features_zero(features); - for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) { + for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) { u64 cur; vp_iowrite32(i, &cfg->guest_feature_select); @@ -448,7 +448,7 @@ void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev, struct virtio_pci_common_cfg __iomem *cfg = mdev->common; int i; - for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) { + for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) { u32 cur = features[i >> 1] >> (32 * (i & 1)); vp_iowrite32(i, &cfg->guest_feature_select); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 7b6205253b46..ddab68959671 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -3166,6 +3166,7 @@ EXPORT_SYMBOL_GPL(virtqueue_map_alloc_coherent); * @vdev: the virtio device we are talking to * @map: metadata for performing mapping * @size: the size of the buffer + * @vaddr: the virtual address that needs to be freed * @map_handle: the mapped address that needs to be freed * */ @@ -3190,7 +3191,7 @@ EXPORT_SYMBOL_GPL(virtqueue_map_free_coherent); * @dir: mapping direction * @attrs: mapping attributes * - * Returns mapped address. Caller should check that by virtqueue_mapping_error(). + * Returns mapped address. Caller should check that by virtqueue_map_mapping_error(). */ dma_addr_t virtqueue_map_page_attrs(const struct virtqueue *_vq, struct page *page, @@ -3249,7 +3250,7 @@ EXPORT_SYMBOL_GPL(virtqueue_unmap_page_attrs); * The caller calls this to do dma mapping in advance. The DMA address can be * passed to this _vq when it is in pre-mapped mode. * - * return mapped address. Caller should check that by virtqueue_mapping_error(). + * return mapped address. Caller should check that by virtqueue_map_mapping_error(). */ dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, @@ -3299,7 +3300,7 @@ void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, EXPORT_SYMBOL_GPL(virtqueue_unmap_single_attrs); /** - * virtqueue_mapping_error - check dma address + * virtqueue_map_mapping_error - check dma address * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index f9a29045eca0..0a801f67b599 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -80,7 +80,7 @@ static void virtio_vdpa_set_status(struct virtio_device *vdev, u8 status) { struct vdpa_device *vdpa = vd_get_vdpa(vdev); - return vdpa_set_status(vdpa, status); + vdpa_set_status(vdpa, status); } static void virtio_vdpa_reset(struct virtio_device *vdev) |
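The vfio_main.c hunk above centralizes VFIO_DEVICE_GET_REGION_INFO handling: a driver's .get_region_info_caps callback only fills struct vfio_region_info plus an optional capability chain, and the core decides whether that chain fits in the caller's argsz before copying anything back to userspace. Below is a small standalone userspace sketch of just that sizing decision; INFO_SIZE, struct reply and negotiate() are hypothetical stand-ins for illustration, not VFIO API.

/* region_info_caps_sketch.c - illustrative only, not kernel code */
#include <stdio.h>

#define INFO_SIZE 32u	/* stand-in for sizeof(struct vfio_region_info) */

struct reply {
	unsigned int argsz;		/* buffer size reported back to the caller */
	unsigned int cap_offset;	/* start of the capability chain, 0 if not copied */
	int caps_copied;		/* whether the chain fits in the caller's buffer */
};

/*
 * Mirrors the sizing decision in vfio_get_region_info(): when the caller's
 * buffer cannot hold info plus the capability chain, report the required
 * argsz and leave cap_offset at 0 so userspace retries with a larger buffer.
 */
static struct reply negotiate(unsigned int user_argsz, unsigned int caps_size)
{
	struct reply r = { .argsz = user_argsz, .cap_offset = 0, .caps_copied = 0 };

	if (!caps_size)
		return r;

	if (user_argsz < INFO_SIZE + caps_size) {
		r.argsz = INFO_SIZE + caps_size;
	} else {
		r.cap_offset = INFO_SIZE;
		r.caps_copied = 1;
	}
	return r;
}

int main(void)
{
	struct reply small = negotiate(INFO_SIZE, 24);		/* too small: caller must retry */
	struct reply big = negotiate(INFO_SIZE + 64, 24);	/* large enough: chain follows info */

	printf("small: argsz=%u cap_offset=%u copied=%d\n",
	       small.argsz, small.cap_offset, small.caps_copied);
	printf("big:   argsz=%u cap_offset=%u copied=%d\n",
	       big.argsz, big.cap_offset, big.caps_copied);
	return 0;
}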
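The vhost and virtio hunks above move from hand-built 64-bit feature masks to flat lists of feature bit numbers, which vhost_features_u64() and DEFINE_VHOST_FEATURES_ARRAY() fold into the per-u64 words of the wider feature array. The standalone userspace sketch below shows the same folding idea under assumed names (FEATURE_BITS, features_word) and an assumed 128-bit feature width; it is not the kernel implementation.

/* feature_fold_sketch.c - illustrative only, not kernel code */
#include <stdint.h>
#include <stdio.h>

#define FEATURE_BITS	128			/* assumed width of the feature space */
#define FEATURE_U64S	(FEATURE_BITS / 64)	/* u64 words needed to hold it */

/*
 * Fold a flat list of feature bit numbers into the 64-bit word selected by
 * idx - the same idea as vhost_features_u64() in the vhost.h hunk above.
 */
static uint64_t features_word(const int *bits, int n, int idx)
{
	uint64_t w = 0;
	int i;

	for (i = 0; i < n; i++)
		if (bits[i] / 64 == idx)
			w |= 1ULL << (bits[i] % 64);
	return w;
}

int main(void)
{
	/* example bit list: a few bits below 64 plus one above it */
	static const int bits[] = { 24, 27, 29, 32, 33, 65 };
	uint64_t words[FEATURE_U64S];
	int i;

	for (i = 0; i < FEATURE_U64S; i++)
		words[i] = features_word(bits, (int)(sizeof(bits) / sizeof(bits[0])), i);

	for (i = 0; i < FEATURE_U64S; i++)
		printf("word %d: 0x%016llx\n", i, (unsigned long long)words[i]);
	return 0;
}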
