authorLinus Torvalds <torvalds@linux-foundation.org>2025-04-02 20:04:43 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-04-02 20:04:43 -0700
commit01ecadbe09b6c685de413ada8ba6688e9467c4b3 (patch)
tree4281c55f68437c789d1f148c22e9b03f8e601526
parenta1b5bd45d4ee58af4f56e49497b8c3db96d8f8a3 (diff)
parentaae0594a7053c60b82621136257c8b648c67b512 (diff)
Merge tag 'cxl-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl
Pull Compute Express Link (CXL) updates from Dave Jiang:

 - Add support for Global Persistent Flush (GPF)

 - Cleanup of DPA partition metadata handling:
     - Remove the CXL_DECODER_MIXED enum that's not needed anymore
     - Introduce helpers to access resource and perf meta data
     - Introduce 'struct cxl_dpa_partition' and 'struct cxl_range_info'
     - Make cxl_dpa_alloc() DPA partition number agnostic
     - Remove cxl_decoder_mode
     - Cleanup partition size and perf helpers
     - Remove unused CXL partition values

 - Add logging support for CXL CPER endpoint and port protocol errors:
     - Prefix protocol error struct and function names with cxl_
     - Move protocol error definitions and structures to a common location
     - Remove drivers/firmware/efi/cper_cxl.h to include/linux/cper.h
     - Add support in GHES to process CXL CPER protocol errors
     - Process CXL CPER protocol errors
     - Add trace logging for CXL PCIe port RAS errors

 - Remove redundant gp_port init

 - Add validation of cxl device serial number

 - CXL ABI documentation updates/fixups

 - A series that uses guard() to clean up open coded mutex lockings and
   remove gotos for error handling.

 - Some followup patches to support dirty shutdown accounting:
     - Add helper to retrieve DVSEC offset for dirty shutdown registers
     - Rename cxl_get_dirty_shutdown() to cxl_arm_dirty_shutdown()
     - Add support for dirty shutdown count via sysfs
     - cxl_test support for dirty shutdown

 - A series to support CXL mailbox Features commands. Mostly in
   preparation for CXL EDAC code to utilize the Features commands. It's
   also in preparation for CXL fwctl support to utilize the CXL
   Features. The commands include "Get Supported Features", "Get
   Feature", and "Set Feature".

 - A series to support extended linear cache support described by the
   ACPI HMAT table. The addition helps enumerate the cache and also
   provides additional RAS reporting support for configuration with
   extended linear cache. (and related fixes for the series)

 - An update to cxl_test to support a 3-way capable CFMWS

 - A documentation fix to remove unused "mixed mode"

* tag 'cxl-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl: (39 commits)
  cxl/region: Fix the first aliased address miscalculation
  cxl/region: Quiet some dev_warn()s in extended linear cache setup
  cxl/Documentation: Remove 'mixed' from sysfs mode doc
  cxl: Fix warning from emitting resource_size_t as long long int on 32bit systems
  cxl/test: Define a CFMWS capable of a 3 way HB interleave
  cxl/mem: Do not return error if CONFIG_CXL_MCE unset
  tools/testing/cxl: Set Shutdown State support
  cxl/pmem: Export dirty shutdown count via sysfs
  cxl/pmem: Rename cxl_dirty_shutdown_state()
  cxl/pci: Introduce cxl_gpf_get_dvsec()
  cxl/pci: Support Global Persistent Flush (GPF)
  cxl: Document missing sysfs files
  cxl: Plug typos in ABI doc
  cxl/pmem: debug invalid serial number data
  cxl/cdat: Remove redundant gp_port initialization
  cxl/memdev: Remove unused partition values
  cxl/region: Drop goto pattern of construct_region()
  cxl/region: Drop goto pattern in cxl_dax_region_alloc()
  cxl/core: Use guard() to drop goto pattern of cxl_dpa_alloc()
  cxl/core: Use guard() to drop the goto pattern of cxl_dpa_free()
  ...
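As background for the guard() conversions called out above: the series replaces open coded down_read()/up_read() and unlock-on-error-path pairs with the scope-based cleanup helpers from <linux/cleanup.h>, so early returns no longer need goto unwind. A minimal sketch of the idiom (illustrative names only, not lifted from any hunk below; the cxl_dpa_size() change in drivers/cxl/core/hdm.c is a real instance of the same pattern):

  #include <linux/cleanup.h>
  #include <linux/ioport.h>
  #include <linux/rwsem.h>

  static DECLARE_RWSEM(example_rwsem);

  static resource_size_t example_read_size(struct resource *res)
  {
          /* the rwsem is released automatically when the scope exits */
          guard(rwsem_read)(&example_rwsem);

          if (!res)
                  return 0;
          return resource_size(res);
  }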
-rw-r--r--  Documentation/ABI/stable/sysfs-devices-node | 6
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-cxl | 53
-rw-r--r--  Documentation/driver-api/cxl/maturity-map.rst | 2
-rw-r--r--  arch/x86/mm/pat/set_memory.c | 1
-rw-r--r--  drivers/acpi/apei/ghes.c | 103
-rw-r--r--  drivers/acpi/numa/hmat.c | 44
-rw-r--r--  drivers/base/node.c | 2
-rw-r--r--  drivers/cxl/Kconfig | 4
-rw-r--r--  drivers/cxl/core/Makefile | 3
-rw-r--r--  drivers/cxl/core/acpi.c | 11
-rw-r--r--  drivers/cxl/core/cdat.c | 102
-rw-r--r--  drivers/cxl/core/core.h | 10
-rw-r--r--  drivers/cxl/core/hdm.c | 382
-rw-r--r--  drivers/cxl/core/mbox.c | 141
-rw-r--r--  drivers/cxl/core/mce.c | 65
-rw-r--r--  drivers/cxl/core/mce.h | 20
-rw-r--r--  drivers/cxl/core/memdev.c | 83
-rw-r--r--  drivers/cxl/core/pci.c | 97
-rw-r--r--  drivers/cxl/core/port.c | 38
-rw-r--r--  drivers/cxl/core/ras.c | 119
-rw-r--r--  drivers/cxl/core/region.c | 336
-rw-r--r--  drivers/cxl/core/trace.h | 81
-rw-r--r--  drivers/cxl/cxl.h | 52
-rw-r--r--  drivers/cxl/cxlmem.h | 77
-rw-r--r--  drivers/cxl/cxlpci.h | 6
-rw-r--r--  drivers/cxl/mem.c | 2
-rw-r--r--  drivers/cxl/pci.c | 7
-rw-r--r--  drivers/cxl/pmem.c | 81
-rw-r--r--  drivers/firmware/efi/cper.c | 6
-rw-r--r--  drivers/firmware/efi/cper_cxl.c | 39
-rw-r--r--  drivers/firmware/efi/cper_cxl.h | 66
-rw-r--r--  include/cxl/event.h | 101
-rw-r--r--  include/linux/acpi.h | 11
-rw-r--r--  include/linux/cper.h | 8
-rw-r--r--  include/linux/node.h | 7
-rw-r--r--  tools/testing/cxl/Kbuild | 3
-rw-r--r--  tools/testing/cxl/test/cxl.c | 32
-rw-r--r--  tools/testing/cxl/test/mem.c | 32
38 files changed, 1645 insertions, 588 deletions
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 402af4b2b905..a02707cb7cbc 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -177,6 +177,12 @@ Description:
The cache write policy: 0 for write-back, 1 for write-through,
other or unknown.
+What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/address_mode
+Date: March 2025
+Contact: Dave Jiang <dave.jiang@intel.com>
+Description:
+ The address mode: 0 for reserved, 1 for extended-linear.
+
What: /sys/devices/system/node/nodeX/x86/sgx_total_bytes
Date: November 2021
Contact: Jarkko Sakkinen <jarkko@kernel.org>
diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index 3f5627a1210a..99bb3faf7a0e 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -1,5 +1,5 @@
What: /sys/bus/cxl/flush
-Date: Januarry, 2022
+Date: January, 2022
KernelVersion: v5.18
Contact: linux-cxl@vger.kernel.org
Description:
@@ -18,6 +18,24 @@ Description:
specification.
+What: /sys/bus/cxl/devices/memX/payload_max
+Date: December, 2020
+KernelVersion: v5.12
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) Maximum size (in bytes) of the mailbox command payload
+ registers. Linux caps this at 1MB if the device reports a
+ larger size.
+
+
+What: /sys/bus/cxl/devices/memX/label_storage_size
+Date: May, 2021
+KernelVersion: v5.13
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) Size (in bytes) of the Label Storage Area (LSA).
+
+
What: /sys/bus/cxl/devices/memX/ram/size
Date: December, 2020
KernelVersion: v5.12
@@ -33,7 +51,7 @@ Date: May, 2023
KernelVersion: v6.8
Contact: linux-cxl@vger.kernel.org
Description:
- (RO) For CXL host platforms that support "QoS Telemmetry"
+ (RO) For CXL host platforms that support "QoS Telemetry"
this attribute conveys a comma delimited list of platform
specific cookies that identifies a QoS performance class
for the volatile partition of the CXL mem device. These
@@ -60,7 +78,7 @@ Date: May, 2023
KernelVersion: v6.8
Contact: linux-cxl@vger.kernel.org
Description:
- (RO) For CXL host platforms that support "QoS Telemmetry"
+ (RO) For CXL host platforms that support "QoS Telemetry"
this attribute conveys a comma delimited list of platform
specific cookies that identifies a QoS performance class
for the persistent partition of the CXL mem device. These
@@ -321,14 +339,13 @@ KernelVersion: v6.0
Contact: linux-cxl@vger.kernel.org
Description:
(RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it
- translates from a host physical address range, to a device local
- address range. Device-local address ranges are further split
- into a 'ram' (volatile memory) range and 'pmem' (persistent
- memory) range. The 'mode' attribute emits one of 'ram', 'pmem',
- 'mixed', or 'none'. The 'mixed' indication is for error cases
- when a decoder straddles the volatile/persistent partition
- boundary, and 'none' indicates the decoder is not actively
- decoding, or no DPA allocation policy has been set.
+ translates from a host physical address range, to a device
+ local address range. Device-local address ranges are further
+ split into a 'ram' (volatile memory) range and 'pmem'
+ (persistent memory) range. The 'mode' attribute emits one of
+ 'ram', 'pmem', or 'none'. The 'none' indicates the decoder is
+ not actively decoding, or no DPA allocation policy has been
+ set.
'mode' can be written, when the decoder is in the 'disabled'
state, with either 'ram' or 'pmem' to set the boundaries for the
@@ -423,7 +440,7 @@ Date: May, 2023
KernelVersion: v6.5
Contact: linux-cxl@vger.kernel.org
Description:
- (RO) For CXL host platforms that support "QoS Telemmetry" this
+ (RO) For CXL host platforms that support "QoS Telemetry" this
root-decoder-only attribute conveys a platform specific cookie
that identifies a QoS performance class for the CXL Window.
This class-id can be compared against a similar "qos_class"
@@ -586,3 +603,15 @@ Description:
See Documentation/ABI/stable/sysfs-devices-node. access0 provides
the number to the closest initiator and access1 provides the
number to the closest CPU.
+
+
+What: /sys/bus/cxl/devices/nvdimm-bridge0/ndbusX/nmemY/cxl/dirty_shutdown
+Date: Feb, 2025
+KernelVersion: v6.15
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) The device dirty shutdown count value, which is the number
+ of times the device could have incurred potential data loss.
+ The count is persistent across power loss and wraps back to 0
+ upon overflow. If this file is not present, the device does not
+ have the necessary support for dirty tracking.
diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst
index df8e2ac2a320..a2288f9df658 100644
--- a/Documentation/driver-api/cxl/maturity-map.rst
+++ b/Documentation/driver-api/cxl/maturity-map.rst
@@ -130,7 +130,7 @@ Mailbox commands
* [0] Switch CCI
* [3] Timestamp
* [1] PMEM labels
-* [0] PMEM GPF / Dirty Shutdown
+* [3] PMEM GPF / Dirty Shutdown
* [0] Scan Media
PMU
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 72405d315b41..def3d9284254 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2274,6 +2274,7 @@ int set_mce_nospec(unsigned long pfn)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
}
+EXPORT_SYMBOL_GPL(set_mce_nospec);
/* Restore full speculative operation to the pfn. */
int clear_mce_nospec(unsigned long pfn)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index b72772494655..289e365f84b2 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -674,6 +674,105 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
schedule_work(&entry->work);
}
+/* Room for 8 entries */
+#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
+static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
+ CXL_CPER_PROT_ERR_FIFO_DEPTH);
+
+/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */
+static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);
+struct work_struct *cxl_cper_prot_err_work;
+
+static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
+ int severity)
+{
+#ifdef CONFIG_ACPI_APEI_PCIEAER
+ struct cxl_cper_prot_err_work_data wd;
+ u8 *dvsec_start, *cap_start;
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) {
+ pr_err_ratelimited("CXL CPER invalid agent type\n");
+ return;
+ }
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+ pr_err_ratelimited("CXL CPER invalid protocol error log\n");
+ return;
+ }
+
+ if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) {
+ pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n",
+ prot_err->err_len);
+ return;
+ }
+
+ if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
+ pr_warn(FW_WARN "CXL CPER no device serial number\n");
+
+ guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);
+
+ if (!cxl_cper_prot_err_work)
+ return;
+
+ switch (prot_err->agent_type) {
+ case RCD:
+ case DEVICE:
+ case LD:
+ case FMLD:
+ case RP:
+ case DSP:
+ case USP:
+ memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err));
+
+ dvsec_start = (u8 *)(prot_err + 1);
+ cap_start = dvsec_start + prot_err->dvsec_len;
+
+ memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap));
+ wd.severity = cper_severity_to_aer(severity);
+ break;
+ default:
+ pr_err_ratelimited("CXL CPER invalid agent type: %d\n",
+ prot_err->agent_type);
+ return;
+ }
+
+ if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
+ pr_err_ratelimited("CXL CPER kfifo overflow\n");
+ return;
+ }
+
+ schedule_work(cxl_cper_prot_err_work);
+#endif
+}
+
+int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+ if (cxl_cper_prot_err_work)
+ return -EINVAL;
+
+ guard(spinlock)(&cxl_cper_prot_err_work_lock);
+ cxl_cper_prot_err_work = work;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL");
+
+int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+ if (cxl_cper_prot_err_work != work)
+ return -EINVAL;
+
+ guard(spinlock)(&cxl_cper_prot_err_work_lock);
+ cxl_cper_prot_err_work = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL");
+
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+ return kfifo_get(&cxl_cper_prot_err_fifo, wd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL");
+
/* Room for 8 entries for each of the 4 event log queues */
#define CXL_CPER_FIFO_DEPTH 32
DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);
@@ -777,6 +876,10 @@ static bool ghes_do_proc(struct ghes *ghes,
}
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
queued = ghes_handle_arm_hw_error(gdata, sev, sync);
+ } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+ struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+
+ cxl_cper_post_prot_err(prot_err, gdata->error_severity);
} else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
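The three exports added above (work registration, unregistration, and kfifo drain) imply a consumer shaped roughly like the sketch below. The actual consumer is wired up in drivers/cxl/core/ras.c later in this diff, so the names here are illustrative only:

  #include <linux/workqueue.h>
  #include <cxl/event.h>  /* assumed home of struct cxl_cper_prot_err_work_data */

  static void example_prot_err_work_fn(struct work_struct *work)
  {
          struct cxl_cper_prot_err_work_data wd;

          /* drain every protocol error record queued by GHES */
          while (cxl_cper_prot_err_kfifo_get(&wd)) {
                  /* decode wd.prot_err / wd.ras_cap and emit trace events */
          }
  }
  static DECLARE_WORK(example_prot_err_work, example_prot_err_work_fn);

  static int __init example_init(void)
  {
          return cxl_cper_register_prot_err_work(&example_prot_err_work);
  }

  static void __exit example_exit(void)
  {
          cxl_cper_unregister_prot_err_work(&example_prot_err_work);
  }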
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index bfbb08b1e6af..9d9052258e92 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -108,6 +108,45 @@ static struct memory_target *find_mem_target(unsigned int mem_pxm)
return NULL;
}
+/**
+ * hmat_get_extended_linear_cache_size - Retrieve the extended linear cache size
+ * @backing_res: resource from the backing media
+ * @nid: node id for the memory region
+ * @cache_size: (Output) size of extended linear cache.
+ *
+ * Return: 0 on success. Errno on failure.
+ *
+ */
+int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid,
+ resource_size_t *cache_size)
+{
+ unsigned int pxm = node_to_pxm(nid);
+ struct memory_target *target;
+ struct target_cache *tcache;
+ struct resource *res;
+
+ target = find_mem_target(pxm);
+ if (!target)
+ return -ENOENT;
+
+ list_for_each_entry(tcache, &target->caches, node) {
+ if (tcache->cache_attrs.address_mode !=
+ NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR)
+ continue;
+
+ res = &target->memregions;
+ if (!resource_contains(res, backing_res))
+ continue;
+
+ *cache_size = tcache->cache_attrs.size;
+ return 0;
+ }
+
+ *cache_size = 0;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(hmat_get_extended_linear_cache_size, "CXL");
+
static struct memory_target *acpi_find_genport_target(u32 uid)
{
struct memory_target *target;
@@ -506,6 +545,11 @@ static __init int hmat_parse_cache(union acpi_subtable_headers *header,
switch ((attrs & ACPI_HMAT_CACHE_ASSOCIATIVITY) >> 8) {
case ACPI_HMAT_CA_DIRECT_MAPPED:
tcache->cache_attrs.indexing = NODE_CACHE_DIRECT_MAP;
+ /* Extended Linear mode is only valid if cache is direct mapped */
+ if (cache->address_mode == ACPI_HMAT_CACHE_MODE_EXTENDED_LINEAR) {
+ tcache->cache_attrs.address_mode =
+ NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR;
+ }
break;
case ACPI_HMAT_CA_COMPLEX_CACHE_INDEXING:
tcache->cache_attrs.indexing = NODE_CACHE_INDEXED;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0ea653fa3433..cd13ef287011 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -244,12 +244,14 @@ CACHE_ATTR(size, "%llu")
CACHE_ATTR(line_size, "%u")
CACHE_ATTR(indexing, "%u")
CACHE_ATTR(write_policy, "%u")
+CACHE_ATTR(address_mode, "%#x")
static struct attribute *cache_attrs[] = {
&dev_attr_indexing.attr,
&dev_attr_size.attr,
&dev_attr_line_size.attr,
&dev_attr_write_policy.attr,
+ &dev_attr_address_mode.attr,
NULL,
};
ATTRIBUTE_GROUPS(cache);
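Given the "%#x" format of the new CACHE_ATTR() above, a read of /sys/devices/system/node/nodeX/memory_side_cache/indexY/address_mode returns 0x0 (reserved) or 0x1 (extended-linear), matching the ABI entry added earlier in this series.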
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 8ac1e9d70eeb..cf1ba673b8c2 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -158,4 +158,8 @@ config CXL_REGION_INVALIDATION_TEST
If unsure, or if this kernel is meant for production environments,
say N.
+config CXL_MCE
+ def_bool y
+ depends on X86_MCE && MEMORY_FAILURE
+
endif
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index b0bfbd9eac9b..086df97a0fcf 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -14,6 +14,9 @@ cxl_core-y += pci.o
cxl_core-y += hdm.o
cxl_core-y += pmu.o
cxl_core-y += cdat.o
+cxl_core-y += ras.o
+cxl_core-y += acpi.o
cxl_core-$(CONFIG_TRACING) += trace.o
cxl_core-$(CONFIG_CXL_REGION) += region.o
+cxl_core-$(CONFIG_CXL_MCE) += mce.o
cxl_core-$(CONFIG_CXL_FEATURES) += features.o
diff --git a/drivers/cxl/core/acpi.c b/drivers/cxl/core/acpi.c
new file mode 100644
index 000000000000..f13b4dae6ac5
--- /dev/null
+++ b/drivers/cxl/core/acpi.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#include <linux/acpi.h>
+#include "cxl.h"
+#include "core.h"
+
+int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res,
+ int nid, resource_size_t *size)
+{
+ return hmat_get_extended_linear_cache_size(backing_res, nid, size);
+}
diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index 8153f8d83a16..edb4f41eeacc 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -258,27 +258,29 @@ static void update_perf_entry(struct device *dev, struct dsmas_entry *dent,
static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds,
struct xarray *dsmas_xa)
{
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
struct device *dev = cxlds->dev;
- struct range pmem_range = {
- .start = cxlds->pmem_res.start,
- .end = cxlds->pmem_res.end,
- };
- struct range ram_range = {
- .start = cxlds->ram_res.start,
- .end = cxlds->ram_res.end,
- };
struct dsmas_entry *dent;
unsigned long index;
xa_for_each(dsmas_xa, index, dent) {
- if (resource_size(&cxlds->ram_res) &&
- range_contains(&ram_range, &dent->dpa_range))
- update_perf_entry(dev, dent, &mds->ram_perf);
- else if (resource_size(&cxlds->pmem_res) &&
- range_contains(&pmem_range, &dent->dpa_range))
- update_perf_entry(dev, dent, &mds->pmem_perf);
- else
+ bool found = false;
+
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct resource *res = &cxlds->part[i].res;
+ struct range range = {
+ .start = res->start,
+ .end = res->end,
+ };
+
+ if (range_contains(&range, &dent->dpa_range)) {
+ update_perf_entry(dev, dent,
+ &cxlds->part[i].perf);
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
dev_dbg(dev, "no partition for dsmas dpa: %pra\n",
&dent->dpa_range);
}
@@ -343,36 +345,46 @@ static int match_cxlrd_hb(struct device *dev, void *data)
return 0;
}
-static int cxl_qos_class_verify(struct cxl_memdev *cxlmd)
+static void cxl_qos_class_verify(struct cxl_memdev *cxlmd)
{
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
struct cxl_port *root_port;
- int rc;
struct cxl_root *cxl_root __free(put_cxl_root) =
find_cxl_root(cxlmd->endpoint);
+ /*
+ * No need to reset_dpa_perf() here as find_cxl_root() is guaranteed to
+ * succeed when called in the cxl_endpoint_port_probe() path.
+ */
if (!cxl_root)
- return -ENODEV;
+ return;
root_port = &cxl_root->port;
- /* Check that the QTG IDs are all sane between end device and root decoders */
- if (!cxl_qos_match(root_port, &mds->ram_perf))
- reset_dpa_perf(&mds->ram_perf);
- if (!cxl_qos_match(root_port, &mds->pmem_perf))
- reset_dpa_perf(&mds->pmem_perf);
-
- /* Check to make sure that the device's host bridge is under a root decoder */
- rc = device_for_each_child(&root_port->dev,
- cxlmd->endpoint->host_bridge, match_cxlrd_hb);
- if (!rc) {
- reset_dpa_perf(&mds->ram_perf);
- reset_dpa_perf(&mds->pmem_perf);
+ /*
+ * Save userspace from needing to check if a qos class has any matches
+ * by hiding qos class info if the memdev is not mapped by a root
+ * decoder, or the partition class does not match any root decoder
+ * class.
+ */
+ if (!device_for_each_child(&root_port->dev,
+ cxlmd->endpoint->host_bridge,
+ match_cxlrd_hb)) {
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct cxl_dpa_perf *perf = &cxlds->part[i].perf;
+
+ reset_dpa_perf(perf);
+ }
+ return;
}
- return rc;
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct cxl_dpa_perf *perf = &cxlds->part[i].perf;
+
+ if (!cxl_qos_match(root_port, perf))
+ reset_dpa_perf(perf);
+ }
}
static void discard_dsmas(struct xarray *xa)
@@ -570,23 +582,18 @@ static bool dpa_perf_contains(struct cxl_dpa_perf *perf,
return range_contains(&perf->dpa_range, &dpa);
}
-static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled,
- enum cxl_decoder_mode mode)
+static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled)
{
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct cxl_dpa_perf *perf;
- switch (mode) {
- case CXL_DECODER_RAM:
- perf = &mds->ram_perf;
- break;
- case CXL_DECODER_PMEM:
- perf = &mds->pmem_perf;
- break;
- default:
+ if (cxled->part < 0)
+ return ERR_PTR(-EINVAL);
+ perf = &cxlds->part[cxled->part].perf;
+
+ if (!perf)
return ERR_PTR(-EINVAL);
- }
if (!dpa_perf_contains(perf, cxled->dpa_res))
return ERR_PTR(-EINVAL);
@@ -647,11 +654,10 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr,
if (cxlds->rcd)
return -ENODEV;
- perf = cxled_get_dpa_perf(cxled, cxlr->mode);
+ perf = cxled_get_dpa_perf(cxled);
if (IS_ERR(perf))
return PTR_ERR(perf);
- gp_port = to_cxl_port(parent_port->dev.parent);
*gp_is_root = is_cxl_root(gp_port);
/*
@@ -1053,7 +1059,7 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
lockdep_assert_held(&cxl_dpa_rwsem);
- perf = cxled_get_dpa_perf(cxled, cxlr->mode);
+ perf = cxled_get_dpa_perf(cxled);
if (IS_ERR(perf))
return;
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 17e99a25c29a..15699299dc11 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -74,8 +74,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
resource_size_t length);
struct dentry *cxl_debugfs_create_dir(const char *dir);
-int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
- enum cxl_decoder_mode mode);
+int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
+ enum cxl_partition_mode mode);
int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size);
int cxl_dpa_free(struct cxl_endpoint_decoder *cxled);
resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
@@ -117,6 +117,12 @@ bool cxl_need_node_perf_attrs_update(int nid);
int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
struct access_coordinate *c);
+int cxl_ras_init(void);
+void cxl_ras_exit(void);
+int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port);
+int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res,
+ int nid, resource_size_t *size);
+
#ifdef CONFIG_CXL_FEATURES
size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
enum cxl_get_feat_selection selection,
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 50e6a45b30ba..70cae4ebf8a4 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -213,16 +213,46 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
{
struct resource *p1, *p2;
- down_read(&cxl_dpa_rwsem);
+ guard(rwsem_read)(&cxl_dpa_rwsem);
for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) {
__cxl_dpa_debug(file, p1, 0);
for (p2 = p1->child; p2; p2 = p2->sibling)
__cxl_dpa_debug(file, p2, 1);
}
- up_read(&cxl_dpa_rwsem);
}
EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
+/* See request_skip() kernel-doc */
+static resource_size_t __adjust_skip(struct cxl_dev_state *cxlds,
+ const resource_size_t skip_base,
+ const resource_size_t skip_len,
+ const char *requester)
+{
+ const resource_size_t skip_end = skip_base + skip_len - 1;
+
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ const struct resource *part_res = &cxlds->part[i].res;
+ resource_size_t adjust_start, adjust_end, size;
+
+ adjust_start = max(skip_base, part_res->start);
+ adjust_end = min(skip_end, part_res->end);
+
+ if (adjust_end < adjust_start)
+ continue;
+
+ size = adjust_end - adjust_start + 1;
+
+ if (!requester)
+ __release_region(&cxlds->dpa_res, adjust_start, size);
+ else if (!__request_region(&cxlds->dpa_res, adjust_start, size,
+ requester, 0))
+ return adjust_start - skip_base;
+ }
+
+ return skip_len;
+}
+#define release_skip(c, b, l) __adjust_skip((c), (b), (l), NULL)
+
/*
* Must be called in a context that synchronizes against this decoder's
* port ->remove() callback (like an endpoint decoder sysfs attribute)
@@ -241,7 +271,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
skip_start = res->start - cxled->skip;
__release_region(&cxlds->dpa_res, res->start, resource_size(res));
if (cxled->skip)
- __release_region(&cxlds->dpa_res, skip_start, cxled->skip);
+ release_skip(cxlds, skip_start, cxled->skip);
cxled->skip = 0;
cxled->dpa_res = NULL;
put_device(&cxled->cxld.dev);
@@ -250,9 +280,8 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
static void cxl_dpa_release(void *cxled)
{
- down_write(&cxl_dpa_rwsem);
+ guard(rwsem_write)(&cxl_dpa_rwsem);
__cxl_dpa_release(cxled);
- up_write(&cxl_dpa_rwsem);
}
/*
@@ -268,6 +297,58 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
__cxl_dpa_release(cxled);
}
+/**
+ * request_skip() - Track DPA 'skip' in @cxlds->dpa_res resource tree
+ * @cxlds: CXL.mem device context that parents @cxled
+ * @cxled: Endpoint decoder establishing new allocation that skips lower DPA
+ * @skip_base: DPA < start of new DPA allocation (DPAnew)
+ * @skip_len: @skip_base + @skip_len == DPAnew
+ *
+ * DPA 'skip' arises from out-of-sequence DPA allocation events relative
+ * to free capacity across multiple partitions. It is a wasteful event
+ * as usable DPA gets thrown away, but if a deployment has, for example,
+ * a dual RAM+PMEM device, wants to use PMEM, and has unallocated RAM
+ * DPA, the free RAM DPA must be sacrificed to start allocating PMEM.
+ * See third "Implementation Note" in CXL 3.1 8.2.4.19.13 "Decoder
+ * Protection" for more details.
+ *
+ * A 'skip' always covers the last allocated DPA in a previous partition
+ * to the start of the current partition to allocate. Allocations never
+ * start in the middle of a partition, and allocations are always
+ * de-allocated in reverse order (see cxl_dpa_free(), or natural devm
+ * unwind order from forced in-order allocation).
+ *
+ * If @cxlds->nr_partitions was guaranteed to be <= 2 then the 'skip'
+ * would always be contained to a single partition. Given
+ * @cxlds->nr_partitions may be > 2 it results in cases where the 'skip'
+ * might span "tail capacity of partition[0], all of partition[1], ...,
+ * all of partition[N-1]" to support allocating from partition[N]. That
+ * in turn interacts with the partition 'struct resource' boundaries
+ * within @cxlds->dpa_res whereby 'skip' requests need to be divided by
+ * partition. I.e. this is a quirk of using a 'struct resource' tree to
+ * detect range conflicts while also tracking partition boundaries in
+ * @cxlds->dpa_res.
+ */
+static int request_skip(struct cxl_dev_state *cxlds,
+ struct cxl_endpoint_decoder *cxled,
+ const resource_size_t skip_base,
+ const resource_size_t skip_len)
+{
+ resource_size_t skipped = __adjust_skip(cxlds, skip_base, skip_len,
+ dev_name(&cxled->cxld.dev));
+
+ if (skipped == skip_len)
+ return 0;
+
+ dev_dbg(cxlds->dev,
+ "%s: failed to reserve skipped space (%pa %pa %pa)\n",
+ dev_name(&cxled->cxld.dev), &skip_base, &skip_len, &skipped);
+
+ release_skip(cxlds, skip_base, skipped);
+
+ return -EBUSY;
+}
+
static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
resource_size_t base, resource_size_t len,
resource_size_t skipped)
@@ -277,6 +358,7 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct device *dev = &port->dev;
struct resource *res;
+ int rc;
lockdep_assert_held_write(&cxl_dpa_rwsem);
@@ -305,14 +387,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
}
if (skipped) {
- res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
- dev_name(&cxled->cxld.dev), 0);
- if (!res) {
- dev_dbg(dev,
- "decoder%d.%d: failed to reserve skipped space\n",
- port->id, cxled->cxld.id);
- return -EBUSY;
- }
+ rc = request_skip(cxlds, cxled, base - skipped, skipped);
+ if (rc)
+ return rc;
}
res = __request_region(&cxlds->dpa_res, base, len,
dev_name(&cxled->cxld.dev), 0);
@@ -320,28 +397,117 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
port->id, cxled->cxld.id);
if (skipped)
- __release_region(&cxlds->dpa_res, base - skipped,
- skipped);
+ release_skip(cxlds, base - skipped, skipped);
return -EBUSY;
}
cxled->dpa_res = res;
cxled->skip = skipped;
- if (resource_contains(&cxlds->pmem_res, res))
- cxled->mode = CXL_DECODER_PMEM;
- else if (resource_contains(&cxlds->ram_res, res))
- cxled->mode = CXL_DECODER_RAM;
- else {
- dev_warn(dev, "decoder%d.%d: %pr mixed mode not supported\n",
- port->id, cxled->cxld.id, cxled->dpa_res);
- cxled->mode = CXL_DECODER_MIXED;
- }
+ /*
+ * When allocating new capacity, ->part is already set, when
+ * discovering decoder settings at initial enumeration, ->part
+ * is not set.
+ */
+ if (cxled->part < 0)
+ for (int i = 0; i < cxlds->nr_partitions; i++)
+ if (resource_contains(&cxlds->part[i].res, res)) {
+ cxled->part = i;
+ break;
+ }
+
+ if (cxled->part < 0)
+ dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n",
+ port->id, cxled->cxld.id, res);
port->hdm_end++;
get_device(&cxled->cxld.dev);
return 0;
}
+static int add_dpa_res(struct device *dev, struct resource *parent,
+ struct resource *res, resource_size_t start,
+ resource_size_t size, const char *type)
+{
+ int rc;
+
+ *res = (struct resource) {
+ .name = type,
+ .start = start,
+ .end = start + size - 1,
+ .flags = IORESOURCE_MEM,
+ };
+ if (resource_size(res) == 0) {
+ dev_dbg(dev, "DPA(%s): no capacity\n", res->name);
+ return 0;
+ }
+ rc = request_resource(parent, res);
+ if (rc) {
+ dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name,
+ res, rc);
+ return rc;
+ }
+
+ dev_dbg(dev, "DPA(%s): %pr\n", res->name, res);
+
+ return 0;
+}
+
+static const char *cxl_mode_name(enum cxl_partition_mode mode)
+{
+ switch (mode) {
+ case CXL_PARTMODE_RAM:
+ return "ram";
+ case CXL_PARTMODE_PMEM:
+ return "pmem";
+ default:
+ return "";
+ };
+}
+
+/* if this fails the caller must destroy @cxlds, there is no recovery */
+int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info)
+{
+ struct device *dev = cxlds->dev;
+
+ guard(rwsem_write)(&cxl_dpa_rwsem);
+
+ if (cxlds->nr_partitions)
+ return -EBUSY;
+
+ if (!info->size || !info->nr_partitions) {
+ cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
+ cxlds->nr_partitions = 0;
+ return 0;
+ }
+
+ cxlds->dpa_res = DEFINE_RES_MEM(0, info->size);
+
+ for (int i = 0; i < info->nr_partitions; i++) {
+ const struct cxl_dpa_part_info *part = &info->part[i];
+ int rc;
+
+ cxlds->part[i].perf.qos_class = CXL_QOS_CLASS_INVALID;
+ cxlds->part[i].mode = part->mode;
+
+ /* Require ordered + contiguous partitions */
+ if (i) {
+ const struct cxl_dpa_part_info *prev = &info->part[i - 1];
+
+ if (prev->range.end + 1 != part->range.start)
+ return -EINVAL;
+ }
+ rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->part[i].res,
+ part->range.start, range_len(&part->range),
+ cxl_mode_name(part->mode));
+ if (rc)
+ return rc;
+ cxlds->nr_partitions++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cxl_dpa_setup);
+
int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
resource_size_t base, resource_size_t len,
resource_size_t skipped)
@@ -362,14 +528,11 @@ EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, "CXL");
resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
{
- resource_size_t size = 0;
-
- down_read(&cxl_dpa_rwsem);
+ guard(rwsem_read)(&cxl_dpa_rwsem);
if (cxled->dpa_res)
- size = resource_size(cxled->dpa_res);
- up_read(&cxl_dpa_rwsem);
+ return resource_size(cxled->dpa_res);
- return size;
+ return 0;
}
resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled)
@@ -387,151 +550,136 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled)
{
struct cxl_port *port = cxled_to_port(cxled);
struct device *dev = &cxled->cxld.dev;
- int rc;
- down_write(&cxl_dpa_rwsem);
- if (!cxled->dpa_res) {
- rc = 0;
- goto out;
- }
+ guard(rwsem_write)(&cxl_dpa_rwsem);
+ if (!cxled->dpa_res)
+ return 0;
if (cxled->cxld.region) {
dev_dbg(dev, "decoder assigned to: %s\n",
dev_name(&cxled->cxld.region->dev));
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
dev_dbg(dev, "decoder enabled\n");
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
if (cxled->cxld.id != port->hdm_end) {
dev_dbg(dev, "expected decoder%d.%d\n", port->id,
port->hdm_end);
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
+
devm_cxl_dpa_release(cxled);
- rc = 0;
-out:
- up_write(&cxl_dpa_rwsem);
- return rc;
+ return 0;
}
-int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
- enum cxl_decoder_mode mode)
+int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
+ enum cxl_partition_mode mode)
{
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct device *dev = &cxled->cxld.dev;
-
- switch (mode) {
- case CXL_DECODER_RAM:
- case CXL_DECODER_PMEM:
- break;
- default:
- dev_dbg(dev, "unsupported mode: %d\n", mode);
- return -EINVAL;
- }
+ int part;
guard(rwsem_write)(&cxl_dpa_rwsem);
if (cxled->cxld.flags & CXL_DECODER_F_ENABLE)
return -EBUSY;
- /*
- * Only allow modes that are supported by the current partition
- * configuration
- */
- if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) {
- dev_dbg(dev, "no available pmem capacity\n");
- return -ENXIO;
+ for (part = 0; part < cxlds->nr_partitions; part++)
+ if (cxlds->part[part].mode == mode)
+ break;
+
+ if (part >= cxlds->nr_partitions) {
+ dev_dbg(dev, "unsupported mode: %d\n", mode);
+ return -EINVAL;
}
- if (mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) {
- dev_dbg(dev, "no available ram capacity\n");
+
+ if (!resource_size(&cxlds->part[part].res)) {
+ dev_dbg(dev, "no available capacity for mode: %d\n", mode);
return -ENXIO;
}
- cxled->mode = mode;
+ cxled->part = part;
return 0;
}
-int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
+static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
{
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
- resource_size_t free_ram_start, free_pmem_start;
- struct cxl_port *port = cxled_to_port(cxled);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct device *dev = &cxled->cxld.dev;
- resource_size_t start, avail, skip;
+ struct resource *res, *prev = NULL;
+ resource_size_t start, avail, skip, skip_start;
struct resource *p, *last;
- int rc;
+ int part;
- down_write(&cxl_dpa_rwsem);
+ guard(rwsem_write)(&cxl_dpa_rwsem);
if (cxled->cxld.region) {
dev_dbg(dev, "decoder attached to %s\n",
dev_name(&cxled->cxld.region->dev));
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
dev_dbg(dev, "decoder enabled\n");
- rc = -EBUSY;
- goto out;
+ return -EBUSY;
}
- for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling)
- last = p;
- if (last)
- free_ram_start = last->end + 1;
- else
- free_ram_start = cxlds->ram_res.start;
+ part = cxled->part;
+ if (part < 0) {
+ dev_dbg(dev, "partition not set\n");
+ return -EBUSY;
+ }
- for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling)
+ res = &cxlds->part[part].res;
+ for (p = res->child, last = NULL; p; p = p->sibling)
last = p;
if (last)
- free_pmem_start = last->end + 1;
+ start = last->end + 1;
else
- free_pmem_start = cxlds->pmem_res.start;
-
- if (cxled->mode == CXL_DECODER_RAM) {
- start = free_ram_start;
- avail = cxlds->ram_res.end - start + 1;
- skip = 0;
- } else if (cxled->mode == CXL_DECODER_PMEM) {
- resource_size_t skip_start, skip_end;
-
- start = free_pmem_start;
- avail = cxlds->pmem_res.end - start + 1;
- skip_start = free_ram_start;
+ start = res->start;
- /*
- * If some pmem is already allocated, then that allocation
- * already handled the skip.
- */
- if (cxlds->pmem_res.child &&
- skip_start == cxlds->pmem_res.child->start)
- skip_end = skip_start - 1;
- else
- skip_end = start - 1;
- skip = skip_end - skip_start + 1;
- } else {
- dev_dbg(dev, "mode not set\n");
- rc = -EINVAL;
- goto out;
+ /*
+ * To allocate at partition N, a skip needs to be calculated for all
+ * unallocated space at lower partitions indices.
+ *
+ * If a partition has any allocations, the search can end because a
+ * previous cxl_dpa_alloc() invocation is assumed to have accounted for
+ * all previous partitions.
+ */
+ skip_start = CXL_RESOURCE_NONE;
+ for (int i = part; i; i--) {
+ prev = &cxlds->part[i - 1].res;
+ for (p = prev->child, last = NULL; p; p = p->sibling)
+ last = p;
+ if (last) {
+ skip_start = last->end + 1;
+ break;
+ }
+ skip_start = prev->start;
}
+ avail = res->end - start + 1;
+ if (skip_start == CXL_RESOURCE_NONE)
+ skip = 0;
+ else
+ skip = res->start - skip_start;
+
if (size > avail) {
dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
- cxl_decoder_mode_name(cxled->mode), &avail);
- rc = -ENOSPC;
- goto out;
+ res->name, &avail);
+ return -ENOSPC;
}
- rc = __cxl_dpa_reserve(cxled, start, size, skip);
-out:
- up_write(&cxl_dpa_rwsem);
+ return __cxl_dpa_reserve(cxled, start, size, skip);
+}
+
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
+{
+ struct cxl_port *port = cxled_to_port(cxled);
+ int rc;
+ rc = __cxl_dpa_alloc(cxled, size);
if (rc)
return rc;
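To make the skip accounting above concrete, take a hypothetical device with two partitions, part[0] 'ram' spanning DPA 0x0-0x3fffffff and part[1] 'pmem' spanning 0x40000000-0x7fffffff, with nothing yet allocated. A first allocation targeting pmem walks the lower partitions in __cxl_dpa_alloc(): part[0] has no child allocations, so skip_start = part[0].start = 0x0 and skip = res->start - skip_start = 0x40000000. __cxl_dpa_reserve() then calls request_skip() to reserve that entire ram range on the decoder's behalf, exactly the "sacrificed" capacity described in the request_skip() kernel-doc. (The numbers are illustrative, not taken from the patch.)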
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 78c5346e3e89..d72764056ce6 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -11,6 +11,7 @@
#include "core.h"
#include "trace.h"
+#include "mce.h"
static bool cxl_raw_allow_all;
@@ -900,7 +901,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
}
if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) {
- u64 dpa, hpa = ULLONG_MAX;
+ u64 dpa, hpa = ULLONG_MAX, hpa_alias = ULLONG_MAX;
struct cxl_region *cxlr;
/*
@@ -913,14 +914,20 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK;
cxlr = cxl_dpa_to_region(cxlmd, dpa);
- if (cxlr)
+ if (cxlr) {
+ u64 cache_size = cxlr->params.cache_size;
+
hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa);
+ if (cache_size)
+ hpa_alias = hpa - cache_size;
+ }
if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
trace_cxl_general_media(cxlmd, type, cxlr, hpa,
- &evt->gen_media);
+ hpa_alias, &evt->gen_media);
else if (event_type == CXL_CPER_EVENT_DRAM)
- trace_cxl_dram(cxlmd, type, cxlr, hpa, &evt->dram);
+ trace_cxl_dram(cxlmd, type, cxlr, hpa, hpa_alias,
+ &evt->dram);
}
}
EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL");
@@ -1126,10 +1133,6 @@ static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
mds->active_persistent_bytes =
le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
- mds->next_volatile_bytes =
- le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
- mds->next_persistent_bytes =
- le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
return 0;
}
@@ -1251,74 +1254,54 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
{
struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
struct cxl_port *endpoint;
- int rc;
/* synchronize with cxl_mem_probe() and decoder write operations */
guard(device)(&cxlmd->dev);
endpoint = cxlmd->endpoint;
- down_read(&cxl_region_rwsem);
+ guard(rwsem_read)(&cxl_region_rwsem);
/*
* Require an endpoint to be safe otherwise the driver can not
* be sure that the device is unmapped.
*/
if (endpoint && cxl_num_decoders_committed(endpoint) == 0)
- rc = __cxl_mem_sanitize(mds, cmd);
- else
- rc = -EBUSY;
- up_read(&cxl_region_rwsem);
+ return __cxl_mem_sanitize(mds, cmd);
- return rc;
+ return -EBUSY;
}
-static int add_dpa_res(struct device *dev, struct resource *parent,
- struct resource *res, resource_size_t start,
- resource_size_t size, const char *type)
+static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode)
{
- int rc;
-
- res->name = type;
- res->start = start;
- res->end = start + size - 1;
- res->flags = IORESOURCE_MEM;
- if (resource_size(res) == 0) {
- dev_dbg(dev, "DPA(%s): no capacity\n", res->name);
- return 0;
- }
- rc = request_resource(parent, res);
- if (rc) {
- dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name,
- res, rc);
- return rc;
- }
+ int i = info->nr_partitions;
- dev_dbg(dev, "DPA(%s): %pr\n", res->name, res);
+ if (size == 0)
+ return;
- return 0;
+ info->part[i].range = (struct range) {
+ .start = start,
+ .end = start + size - 1,
+ };
+ info->part[i].mode = mode;
+ info->nr_partitions++;
}
-int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
+int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info)
{
struct cxl_dev_state *cxlds = &mds->cxlds;
struct device *dev = cxlds->dev;
int rc;
if (!cxlds->media_ready) {
- cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
- cxlds->ram_res = DEFINE_RES_MEM(0, 0);
- cxlds->pmem_res = DEFINE_RES_MEM(0, 0);
+ info->size = 0;
return 0;
}
- cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes);
+ info->size = mds->total_bytes;
if (mds->partition_align_bytes == 0) {
- rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
- mds->volatile_only_bytes, "ram");
- if (rc)
- return rc;
- return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
- mds->volatile_only_bytes,
- mds->persistent_only_bytes, "pmem");
+ add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM);
+ add_part(info, mds->volatile_only_bytes,
+ mds->persistent_only_bytes, CXL_PARTMODE_PMEM);
+ return 0;
}
rc = cxl_mem_get_partition_info(mds);
@@ -1327,15 +1310,52 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
return rc;
}
- rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
- mds->active_volatile_bytes, "ram");
- if (rc)
- return rc;
- return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
- mds->active_volatile_bytes,
- mds->active_persistent_bytes, "pmem");
+ add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM);
+ add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes,
+ CXL_PARTMODE_PMEM);
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL");
+
+int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_get_health_info_out hi;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_HEALTH_INFO,
+ .size_out = sizeof(hi),
+ .payload_out = &hi,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (!rc)
+ *count = le32_to_cpu(hi.dirty_shutdown_cnt);
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_get_dirty_count, "CXL");
+
+int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_mbox_set_shutdown_state_in in = {
+ .state = 1
+ };
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+ .size_in = sizeof(in),
+ .payload_in = &in,
+ };
+
+ return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
}
-EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL");
+EXPORT_SYMBOL_NS_GPL(cxl_arm_dirty_shutdown, "CXL");
int cxl_set_timestamp(struct cxl_memdev_state *mds)
{
@@ -1467,6 +1487,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL");
struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
{
struct cxl_memdev_state *mds;
+ int rc;
mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL);
if (!mds) {
@@ -1480,8 +1501,12 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
mds->cxlds.cxl_mbox.host = dev;
mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE;
mds->cxlds.type = CXL_DEVTYPE_CLASSMEM;
- mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
- mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
+
+ rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier);
+ if (rc == -EOPNOTSUPP)
+ dev_warn(dev, "CXL MCE unsupported\n");
+ else if (rc)
+ return ERR_PTR(rc);
return mds;
}
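The mbox.c changes above only fill a struct cxl_dpa_info; turning it into DPA resources is the job of cxl_dpa_setup() added in hdm.c earlier. The expected probe-path pairing is roughly the sketch below (not a verbatim hunk from this pull; the real caller sits in the endpoint driver probe path):

  static int example_setup_dpa(struct cxl_memdev_state *mds)
  {
          struct cxl_dev_state *cxlds = &mds->cxlds;
          struct cxl_dpa_info info = { 0 };
          int rc;

          /* mailbox enumeration of the partition ranges */
          rc = cxl_mem_dpa_fetch(mds, &info);
          if (rc)
                  return rc;

          /* publish cxlds->dpa_res and cxlds->part[] from those ranges */
          return cxl_dpa_setup(cxlds, &info);
  }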
diff --git a/drivers/cxl/core/mce.c b/drivers/cxl/core/mce.c
new file mode 100644
index 000000000000..ff8d078c6ca1
--- /dev/null
+++ b/drivers/cxl/core/mce.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/set_memory.h>
+#include <asm/mce.h>
+#include <cxlmem.h>
+#include "mce.h"
+
+static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
+ mce_notifier);
+ struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+ struct cxl_port *endpoint = cxlmd->endpoint;
+ struct mce *mce = data;
+ u64 spa, spa_alias;
+ unsigned long pfn;
+
+ if (!mce || !mce_usable_address(mce))
+ return NOTIFY_DONE;
+
+ if (!endpoint)
+ return NOTIFY_DONE;
+
+ spa = mce->addr & MCI_ADDR_PHYSADDR;
+
+ pfn = spa >> PAGE_SHIFT;
+ if (!pfn_valid(pfn))
+ return NOTIFY_DONE;
+
+ spa_alias = cxl_port_get_spa_cache_alias(endpoint, spa);
+ if (spa_alias == ~0ULL)
+ return NOTIFY_DONE;
+
+ pfn = spa_alias >> PAGE_SHIFT;
+
+ /*
+ * Take down the aliased memory page. The original memory page flagged
+ * by the MCE will be taken care of by the standard MCE handler.
+ */
+ dev_emerg(mds->cxlds.dev, "Offlining aliased SPA address0: %#llx\n",
+ spa_alias);
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
+
+ return NOTIFY_OK;
+}
+
+static void cxl_unregister_mce_notifier(void *mce_notifier)
+{
+ mce_unregister_decode_chain(mce_notifier);
+}
+
+int devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifier)
+{
+ mce_notifier->notifier_call = cxl_handle_mce;
+ mce_notifier->priority = MCE_PRIO_UC;
+ mce_register_decode_chain(mce_notifier);
+
+ return devm_add_action_or_reset(dev, cxl_unregister_mce_notifier,
+ mce_notifier);
+}
diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h
new file mode 100644
index 000000000000..ace73424eeb6
--- /dev/null
+++ b/drivers/cxl/core/mce.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#ifndef _CXL_CORE_MCE_H_
+#define _CXL_CORE_MCE_H_
+
+#include <linux/notifier.h>
+
+#ifdef CONFIG_CXL_MCE
+int devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifer);
+#else
+static inline int
+devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifier)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 2e2e035abdaa..a16a5886d40a 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -75,12 +75,20 @@ static ssize_t label_storage_size_show(struct device *dev,
}
static DEVICE_ATTR_RO(label_storage_size);
+static resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds)
+{
+ /* Static RAM is only expected at partition 0. */
+ if (cxlds->part[0].mode != CXL_PARTMODE_RAM)
+ return 0;
+ return resource_size(&cxlds->part[0].res);
+}
+
static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- unsigned long long len = resource_size(&cxlds->ram_res);
+ unsigned long long len = cxl_ram_size(cxlds);
return sysfs_emit(buf, "%#llx\n", len);
}
@@ -93,7 +101,7 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- unsigned long long len = resource_size(&cxlds->pmem_res);
+ unsigned long long len = cxl_pmem_size(cxlds);
return sysfs_emit(buf, "%#llx\n", len);
}
@@ -198,22 +206,17 @@ static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd)
int rc = 0;
/* CXL 3.0 Spec 8.2.9.8.4.1 Separate pmem and ram poison requests */
- if (resource_size(&cxlds->pmem_res)) {
- offset = cxlds->pmem_res.start;
- length = resource_size(&cxlds->pmem_res);
- rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
- if (rc)
- return rc;
- }
- if (resource_size(&cxlds->ram_res)) {
- offset = cxlds->ram_res.start;
- length = resource_size(&cxlds->ram_res);
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ const struct resource *res = &cxlds->part[i].res;
+
+ offset = res->start;
+ length = resource_size(res);
rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
/*
* Invalid Physical Address is not an error for
* volatile addresses. Device support is optional.
*/
- if (rc == -EFAULT)
+ if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
rc = 0;
}
return rc;
@@ -404,14 +407,21 @@ static struct attribute *cxl_memdev_attributes[] = {
NULL,
};
+static struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds)
+{
+ for (int i = 0; i < cxlds->nr_partitions; i++)
+ if (cxlds->part[i].mode == CXL_PARTMODE_PMEM)
+ return &cxlds->part[i].perf;
+ return NULL;
+}
+
static ssize_t pmem_qos_class_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
- return sysfs_emit(buf, "%d\n", mds->pmem_perf.qos_class);
+ return sysfs_emit(buf, "%d\n", to_pmem_perf(cxlds)->qos_class);
}
static struct device_attribute dev_attr_pmem_qos_class =
@@ -423,14 +433,20 @@ static struct attribute *cxl_memdev_pmem_attributes[] = {
NULL,
};
+static struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds)
+{
+ if (cxlds->part[0].mode != CXL_PARTMODE_RAM)
+ return NULL;
+ return &cxlds->part[0].perf;
+}
+
static ssize_t ram_qos_class_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
- return sysfs_emit(buf, "%d\n", mds->ram_perf.qos_class);
+ return sysfs_emit(buf, "%d\n", to_ram_perf(cxlds)->qos_class);
}
static struct device_attribute dev_attr_ram_qos_class =
@@ -466,11 +482,11 @@ static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n)
{
struct device *dev = kobj_to_dev(kobj);
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_dpa_perf *perf = to_ram_perf(cxlmd->cxlds);
- if (a == &dev_attr_ram_qos_class.attr)
- if (mds->ram_perf.qos_class == CXL_QOS_CLASS_INVALID)
- return 0;
+ if (a == &dev_attr_ram_qos_class.attr &&
+ (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID))
+ return 0;
return a->mode;
}
@@ -485,11 +501,11 @@ static umode_t cxl_pmem_visible(struct kobject *kobj, struct attribute *a, int n
{
struct device *dev = kobj_to_dev(kobj);
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_dpa_perf *perf = to_pmem_perf(cxlmd->cxlds);
- if (a == &dev_attr_pmem_qos_class.attr)
- if (mds->pmem_perf.qos_class == CXL_QOS_CLASS_INVALID)
- return 0;
+ if (a == &dev_attr_pmem_qos_class.attr &&
+ (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID))
+ return 0;
return a->mode;
}
@@ -566,10 +582,9 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
{
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
- down_write(&cxl_memdev_rwsem);
+ guard(rwsem_write)(&cxl_memdev_rwsem);
bitmap_or(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds,
cmds, CXL_MEM_COMMAND_ID_MAX);
- up_write(&cxl_memdev_rwsem);
}
EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, "CXL");
@@ -583,10 +598,9 @@ void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
{
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
- down_write(&cxl_memdev_rwsem);
+ guard(rwsem_write)(&cxl_memdev_rwsem);
bitmap_andnot(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds,
cmds, CXL_MEM_COMMAND_ID_MAX);
- up_write(&cxl_memdev_rwsem);
}
EXPORT_SYMBOL_NS_GPL(clear_exclusive_cxl_commands, "CXL");
@@ -594,9 +608,8 @@ static void cxl_memdev_shutdown(struct device *dev)
{
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- down_write(&cxl_memdev_rwsem);
+ guard(rwsem_write)(&cxl_memdev_rwsem);
cxlmd->cxlds = NULL;
- up_write(&cxl_memdev_rwsem);
}
static void cxl_memdev_unregister(void *_cxlmd)
@@ -678,15 +691,13 @@ static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
{
struct cxl_memdev *cxlmd = file->private_data;
struct cxl_dev_state *cxlds;
- int rc = -ENXIO;
- down_read(&cxl_memdev_rwsem);
+ guard(rwsem_read)(&cxl_memdev_rwsem);
cxlds = cxlmd->cxlds;
if (cxlds && cxlds->type == CXL_DEVTYPE_CLASSMEM)
- rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
- up_read(&cxl_memdev_rwsem);
+ return __cxl_memdev_ioctl(cxlmd, cmd, arg);
- return rc;
+ return -ENXIO;
}
static int cxl_memdev_open(struct inode *inode, struct file *file)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 013b869b66cb..96fecb799cbc 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -1054,3 +1054,100 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
return 0;
}
+
+/*
+ * Set max timeout such that platforms will optimize GPF flow to avoid
+ * the implied worst-case scenario delays. On a sane platform, all
+ * devices should always complete GPF within the energy budget of
+ * the GPF flow. The kernel does not have enough information to pick
+ * anything better than "maximize timeouts and hope it works".
+ *
+ * A misbehaving device could block forward progress of GPF for all
+ * the other devices, exhausting the energy budget of the platform.
+ * However, the spec seems to assume that moving on from slow to respond
+ * devices is a virtue. It is not possible to know that, in actuality,
+ * the slow to respond device is *the* most critical device in the
+ * system to wait.
+ */
+#define GPF_TIMEOUT_BASE_MAX 2
+#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
+
+u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port)
+{
+ u16 dvsec;
+
+ if (!dev_is_pci(dev))
+ return 0;
+
+ dvsec = pci_find_dvsec_capability(to_pci_dev(dev), PCI_VENDOR_ID_CXL,
+ is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF);
+ if (!dvsec)
+ dev_warn(dev, "%s GPF DVSEC not present\n",
+ is_port ? "Port" : "Device");
+ return dvsec;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_gpf_get_dvsec, "CXL");
+
+static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase)
+{
+ u64 base, scale;
+ int rc, offset;
+ u16 ctrl;
+
+ switch (phase) {
+ case 1:
+ offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET;
+ base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK;
+ scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK;
+ break;
+ case 2:
+ offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET;
+ base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK;
+ scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ rc = pci_read_config_word(pdev, dvsec + offset, &ctrl);
+ if (rc)
+ return rc;
+
+ if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX &&
+ FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX)
+ return 0;
+
+ ctrl = FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX);
+ ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX);
+
+ rc = pci_write_config_word(pdev, dvsec + offset, ctrl);
+ if (!rc)
+ pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n",
+ phase, GPF_TIMEOUT_BASE_MAX);
+
+ return rc;
+}
+
+int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port)
+{
+ struct pci_dev *pdev;
+
+ if (!port)
+ return -EINVAL;
+
+ if (!port->gpf_dvsec) {
+ int dvsec;
+
+ dvsec = cxl_gpf_get_dvsec(dport_dev, true);
+ if (!dvsec)
+ return -EINVAL;
+
+ port->gpf_dvsec = dvsec;
+ }
+
+ pdev = to_pci_dev(dport_dev);
+ update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1);
+ update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2);
+
+ return 0;
+}
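For reference, each GPF phase control word packs a 4-bit timeout base (bits 3:0) and a 4-bit timeout scale (bits 11:8), which update_gpf_port_dvsec() programs with FIELD_PREP() to base=2, scale=7. Reading scale=7 as "units of 10 seconds" follows the comment in the hunk and is an assumption here; with that reading, base=2 gives the 20 seconds reported by the pci_dbg() message. A standalone sketch of the encoding:

/* Sketch of the GPF phase-timeout control-word layout used above; masks
 * mirror CXL_DVSEC_PORT_GPF_PHASE_*_TMO_{BASE,SCALE}_MASK. */
#include <stdint.h>
#include <stdio.h>

#define TMO_BASE_MASK	0x000fu		/* GENMASK(3, 0)  */
#define TMO_SCALE_MASK	0x0f00u		/* GENMASK(11, 8) */

static uint16_t gpf_ctrl_encode(unsigned int base, unsigned int scale)
{
	/* open-coded equivalent of the two FIELD_PREP() calls */
	return (uint16_t)((base & 0xf) | ((scale & 0xf) << 8));
}

int main(void)
{
	uint16_t ctrl = gpf_ctrl_encode(2, 7);	/* values the driver writes */

	printf("ctrl=0x%04x base=%u scale=%u\n", ctrl,
	       ctrl & TMO_BASE_MASK, (ctrl & TMO_SCALE_MASK) >> 8);
	/* scale=7 taken as 10 s units, base=2 -> the "20 secs" debug print */
	return 0;
}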
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 78a5c2c25982..0fd6646c1a2e 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -194,25 +194,35 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ /* without @cxl_dpa_rwsem, make sure @part is not reloaded */
+ int part = READ_ONCE(cxled->part);
+ const char *desc;
+
+ if (part < 0)
+ desc = "none";
+ else
+ desc = cxlds->part[part].res.name;
- return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxled->mode));
+ return sysfs_emit(buf, "%s\n", desc);
}
static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
- enum cxl_decoder_mode mode;
+ enum cxl_partition_mode mode;
ssize_t rc;
if (sysfs_streq(buf, "pmem"))
- mode = CXL_DECODER_PMEM;
+ mode = CXL_PARTMODE_PMEM;
else if (sysfs_streq(buf, "ram"))
- mode = CXL_DECODER_RAM;
+ mode = CXL_PARTMODE_RAM;
else
return -EINVAL;
- rc = cxl_dpa_set_mode(cxled, mode);
+ rc = cxl_dpa_set_part(cxled, mode);
if (rc)
return rc;
@@ -549,13 +559,9 @@ static ssize_t decoders_committed_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cxl_port *port = to_cxl_port(dev);
- int rc;
-
- down_read(&cxl_region_rwsem);
- rc = sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port));
- up_read(&cxl_region_rwsem);
- return rc;
+ guard(rwsem_read)(&cxl_region_rwsem);
+ return sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port));
}
static DEVICE_ATTR_RO(decoders_committed);
@@ -1672,6 +1678,8 @@ retry:
if (rc && rc != -EBUSY)
return rc;
+ cxl_gpf_port_setup(dport_dev, port);
+
/* Any more ports to add between this one and the root? */
if (!dev_is_cxl_root_child(&port->dev))
continue;
@@ -1899,6 +1907,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
return ERR_PTR(-ENOMEM);
cxled->pos = -1;
+ cxled->part = -1;
cxld = &cxled->cxld;
rc = cxl_decoder_init(port, cxld);
if (rc) {
@@ -2339,8 +2348,14 @@ static __init int cxl_core_init(void)
if (rc)
goto err_region;
+ rc = cxl_ras_init();
+ if (rc)
+ goto err_ras;
+
return 0;
+err_ras:
+ cxl_region_exit();
err_region:
bus_unregister(&cxl_bus_type);
err_bus:
@@ -2352,6 +2367,7 @@ err_wq:
static void cxl_core_exit(void)
{
+ cxl_ras_exit();
cxl_region_exit();
bus_unregister(&cxl_bus_type);
destroy_workqueue(cxl_bus_wq);
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
new file mode 100644
index 000000000000..485a831695c7
--- /dev/null
+++ b/drivers/cxl/core/ras.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
+
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <cxl/event.h>
+#include <cxlmem.h>
+#include "trace.h"
+
+static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+
+ trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+}
+
+static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ u32 fe;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ ras_cap.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
+ ras_cap.header_log);
+}
+
+static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+ struct cxl_dev_state *cxlds;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+}
+
+static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ struct cxl_dev_state *cxlds;
+ u32 fe;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ ras_cap.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+ ras_cap.header_log);
+}
+
+static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
+{
+ unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
+ data->prot_err.agent_addr.function);
+ struct pci_dev *pdev __free(pci_dev_put) =
+ pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
+ data->prot_err.agent_addr.bus,
+ devfn);
+ int port_type;
+
+ if (!pdev)
+ return;
+
+ guard(device)(&pdev->dev);
+
+ port_type = pci_pcie_type(pdev);
+ if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
+ port_type == PCI_EXP_TYPE_DOWNSTREAM ||
+ port_type == PCI_EXP_TYPE_UPSTREAM) {
+ if (data->severity == AER_CORRECTABLE)
+ cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
+ else
+ cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);
+
+ return;
+ }
+
+ if (data->severity == AER_CORRECTABLE)
+ cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+ else
+ cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+}
+
+static void cxl_cper_prot_err_work_fn(struct work_struct *work)
+{
+ struct cxl_cper_prot_err_work_data wd;
+
+ while (cxl_cper_prot_err_kfifo_get(&wd))
+ cxl_cper_handle_prot_err(&wd);
+}
+static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
+
+int cxl_ras_init(void)
+{
+ return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+}
+
+void cxl_ras_exit(void)
+{
+ cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
+ cancel_work_sync(&cxl_cper_prot_err_work);
+}
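For reference, the uncorrectable handlers above pick a "first error" bit: if exactly one status bit is set that bit is the first error, otherwise the First Error Pointer field in cap_control indexes it. A standalone sketch of that selection; the field position below is an assumption, the driver uses CXL_RAS_CAP_CONTROL_FE_MASK defined elsewhere:

/* Sketch of first-error selection in cxl_cper_trace_uncorr_*_prot_err(). */
#include <stdint.h>
#include <stdio.h>

#define FE_MASK 0x3fu	/* assumed First Error Pointer field, bits 5:0 */

static uint32_t first_error(uint32_t status, uint32_t cap_control)
{
	if (__builtin_popcount(status) > 1)
		return 1u << (cap_control & FE_MASK);
	return status;
}

int main(void)
{
	/* two bits set -> pointer value 3 selects bit 3 */
	printf("0x%x\n", first_error(0x0a, 0x3));
	/* single bit set -> the status itself is the first error */
	printf("0x%x\n", first_error(0x08, 0x3));
	return 0;
}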
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index e8d11a988fd9..c3f4dc244df7 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -144,7 +144,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
rc = down_read_interruptible(&cxl_region_rwsem);
if (rc)
return rc;
- if (cxlr->mode != CXL_DECODER_PMEM)
+ if (cxlr->mode != CXL_PARTMODE_PMEM)
rc = sysfs_emit(buf, "\n");
else
rc = sysfs_emit(buf, "%pUb\n", &p->uuid);
@@ -441,7 +441,7 @@ static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
* Support tooling that expects to find a 'uuid' attribute for all
* regions regardless of mode.
*/
- if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM)
+ if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
return 0444;
return a->mode;
}
@@ -603,8 +603,16 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct cxl_region *cxlr = to_cxl_region(dev);
+ const char *desc;
- return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxlr->mode));
+ if (cxlr->mode == CXL_PARTMODE_RAM)
+ desc = "ram";
+ else if (cxlr->mode == CXL_PARTMODE_PMEM)
+ desc = "pmem";
+ else
+ desc = "";
+
+ return sysfs_emit(buf, "%s\n", desc);
}
static DEVICE_ATTR_RO(mode);
@@ -630,7 +638,7 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
/* ways, granularity and uuid (if PMEM) need to be set before HPA */
if (!p->interleave_ways || !p->interleave_granularity ||
- (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid)))
+ (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid)))
return -ENXIO;
div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder);
@@ -824,6 +832,21 @@ static int match_free_decoder(struct device *dev, const void *data)
return 1;
}
+static bool region_res_match_cxl_range(const struct cxl_region_params *p,
+ struct range *range)
+{
+ if (!p->res)
+ return false;
+
+ /*
+ * If this is an extended linear cache region, the CXL range is assumed
+ * to be fronted by the DRAM range in the current known implementation.
+ * This assumption holds until a variant implementation exists.
+ */
+ return p->res->start + p->cache_size == range->start &&
+ p->res->end == range->end;
+}
+
static int match_auto_decoder(struct device *dev, const void *data)
{
const struct cxl_region_params *p = data;
@@ -836,7 +859,7 @@ static int match_auto_decoder(struct device *dev, const void *data)
cxld = to_cxl_decoder(dev);
r = &cxld->hpa_range;
- if (p->res && p->res->start == r->start && p->res->end == r->end)
+ if (region_res_match_cxl_range(p, r))
return 1;
return 0;
@@ -1424,8 +1447,7 @@ static int cxl_port_setup_targets(struct cxl_port *port,
if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
if (cxld->interleave_ways != iw ||
cxld->interleave_granularity != ig ||
- cxld->hpa_range.start != p->res->start ||
- cxld->hpa_range.end != p->res->end ||
+ !region_res_match_cxl_range(p, &cxld->hpa_range) ||
((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
dev_err(&cxlr->dev,
"%s:%s %s expected iw: %d ig: %d %pr\n",
@@ -1888,6 +1910,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct cxl_region_params *p = &cxlr->params;
struct cxl_port *ep_port, *root_port;
struct cxl_dport *dport;
@@ -1902,17 +1925,17 @@ static int cxl_region_attach(struct cxl_region *cxlr,
return rc;
}
- if (cxled->mode != cxlr->mode) {
- dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
- dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
- return -EINVAL;
- }
-
- if (cxled->mode == CXL_DECODER_DEAD) {
+ if (cxled->part < 0) {
dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
return -ENODEV;
}
+ if (cxlds->part[cxled->part].mode != cxlr->mode) {
+ dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n",
+ dev_name(&cxled->cxld.dev), cxlr->mode);
+ return -EINVAL;
+ }
+
/* all full of members, or interleave config not established? */
if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
dev_dbg(&cxlr->dev, "region already active\n");
@@ -1951,13 +1974,13 @@ static int cxl_region_attach(struct cxl_region *cxlr,
return -ENXIO;
}
- if (resource_size(cxled->dpa_res) * p->interleave_ways !=
+ if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size !=
resource_size(p->res)) {
dev_dbg(&cxlr->dev,
- "%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n",
+ "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
(u64)resource_size(cxled->dpa_res), p->interleave_ways,
- (u64)resource_size(p->res));
+ (u64)p->cache_size, (u64)resource_size(p->res));
return -EINVAL;
}
@@ -2115,7 +2138,7 @@ out:
void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled)
{
down_write(&cxl_region_rwsem);
- cxled->mode = CXL_DECODER_DEAD;
+ cxled->part = -1;
cxl_region_detach(cxled);
up_write(&cxl_region_rwsem);
}
@@ -2471,7 +2494,7 @@ static int cxl_region_calculate_adistance(struct notifier_block *nb,
*/
static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
int id,
- enum cxl_decoder_mode mode,
+ enum cxl_partition_mode mode,
enum cxl_decoder_type type)
{
struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
@@ -2525,13 +2548,13 @@ static ssize_t create_ram_region_show(struct device *dev,
}
static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
- enum cxl_decoder_mode mode, int id)
+ enum cxl_partition_mode mode, int id)
{
int rc;
switch (mode) {
- case CXL_DECODER_RAM:
- case CXL_DECODER_PMEM:
+ case CXL_PARTMODE_RAM:
+ case CXL_PARTMODE_PMEM:
break;
default:
dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
@@ -2551,7 +2574,7 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
}
static ssize_t create_region_store(struct device *dev, const char *buf,
- size_t len, enum cxl_decoder_mode mode)
+ size_t len, enum cxl_partition_mode mode)
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
struct cxl_region *cxlr;
@@ -2572,7 +2595,7 @@ static ssize_t create_pmem_region_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return create_region_store(dev, buf, len, CXL_DECODER_PMEM);
+ return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM);
}
DEVICE_ATTR_RW(create_pmem_region);
@@ -2580,7 +2603,7 @@ static ssize_t create_ram_region_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return create_region_store(dev, buf, len, CXL_DECODER_RAM);
+ return create_region_store(dev, buf, len, CXL_PARTMODE_RAM);
}
DEVICE_ATTR_RW(create_ram_region);
@@ -2678,7 +2701,7 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL");
struct cxl_poison_context {
struct cxl_port *port;
- enum cxl_decoder_mode mode;
+ int part;
u64 offset;
};
@@ -2686,47 +2709,45 @@ static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
struct cxl_poison_context *ctx)
{
struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ const struct resource *res;
+ struct resource *p, *last;
u64 offset, length;
int rc = 0;
+ if (ctx->part < 0)
+ return 0;
+
/*
- * Collect poison for the remaining unmapped resources
- * after poison is collected by committed endpoints.
- *
- * Knowing that PMEM must always follow RAM, get poison
- * for unmapped resources based on the last decoder's mode:
- * ram: scan remains of ram range, then any pmem range
- * pmem: scan remains of pmem range
+ * Collect poison for the remaining unmapped resources after
+ * poison is collected by committed endpoint decoders.
*/
-
- if (ctx->mode == CXL_DECODER_RAM) {
- offset = ctx->offset;
- length = resource_size(&cxlds->ram_res) - offset;
+ for (int i = ctx->part; i < cxlds->nr_partitions; i++) {
+ res = &cxlds->part[i].res;
+ for (p = res->child, last = NULL; p; p = p->sibling)
+ last = p;
+ if (last)
+ offset = last->end + 1;
+ else
+ offset = res->start;
+ length = res->end - offset + 1;
+ if (!length)
+ break;
rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
- if (rc == -EFAULT)
- rc = 0;
+ if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
+ continue;
if (rc)
- return rc;
- }
- if (ctx->mode == CXL_DECODER_PMEM) {
- offset = ctx->offset;
- length = resource_size(&cxlds->dpa_res) - offset;
- if (!length)
- return 0;
- } else if (resource_size(&cxlds->pmem_res)) {
- offset = cxlds->pmem_res.start;
- length = resource_size(&cxlds->pmem_res);
- } else {
- return 0;
+ break;
}
- return cxl_mem_get_poison(cxlmd, offset, length, NULL);
+ return rc;
}
static int poison_by_decoder(struct device *dev, void *arg)
{
struct cxl_poison_context *ctx = arg;
struct cxl_endpoint_decoder *cxled;
+ enum cxl_partition_mode mode;
+ struct cxl_dev_state *cxlds;
struct cxl_memdev *cxlmd;
u64 offset, length;
int rc = 0;
@@ -2735,27 +2756,18 @@ static int poison_by_decoder(struct device *dev, void *arg)
return rc;
cxled = to_cxl_endpoint_decoder(dev);
- if (!cxled->dpa_res || !resource_size(cxled->dpa_res))
- return rc;
-
- /*
- * Regions are only created with single mode decoders: pmem or ram.
- * Linux does not support mixed mode decoders. This means that
- * reading poison per endpoint decoder adheres to the requirement
- * that poison reads of pmem and ram must be separated.
- * CXL 3.0 Spec 8.2.9.8.4.1
- */
- if (cxled->mode == CXL_DECODER_MIXED) {
- dev_dbg(dev, "poison list read unsupported in mixed mode\n");
+ if (!cxled->dpa_res)
return rc;
- }
cxlmd = cxled_to_memdev(cxled);
+ cxlds = cxlmd->cxlds;
+ mode = cxlds->part[cxled->part].mode;
+
if (cxled->skip) {
offset = cxled->dpa_res->start - cxled->skip;
length = cxled->skip;
rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
- if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
+ if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
rc = 0;
if (rc)
return rc;
@@ -2764,7 +2776,7 @@ static int poison_by_decoder(struct device *dev, void *arg)
offset = cxled->dpa_res->start;
length = cxled->dpa_res->end - offset + 1;
rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
- if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
+ if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
rc = 0;
if (rc)
return rc;
@@ -2772,7 +2784,7 @@ static int poison_by_decoder(struct device *dev, void *arg)
/* Iterate until commit_end is reached */
if (cxled->cxld.id == ctx->port->commit_end) {
ctx->offset = cxled->dpa_res->end + 1;
- ctx->mode = cxled->mode;
+ ctx->part = cxled->part;
return 1;
}
@@ -2785,7 +2797,8 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port)
int rc = 0;
ctx = (struct cxl_poison_context) {
- .port = port
+ .port = port,
+ .part = -1,
};
rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder);
@@ -2921,7 +2934,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
/* Apply the hpa_offset to the region base address */
- hpa = hpa_offset + p->res->start;
+ hpa = hpa_offset + p->res->start + p->cache_size;
/* Root decoder translation overrides typical modulo decode */
if (cxlrd->hpa_to_spa)
@@ -3038,17 +3051,13 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
struct cxl_dax_region *cxlr_dax;
struct device *dev;
- down_read(&cxl_region_rwsem);
- if (p->state != CXL_CONFIG_COMMIT) {
- cxlr_dax = ERR_PTR(-ENXIO);
- goto out;
- }
+ guard(rwsem_read)(&cxl_region_rwsem);
+ if (p->state != CXL_CONFIG_COMMIT)
+ return ERR_PTR(-ENXIO);
cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL);
- if (!cxlr_dax) {
- cxlr_dax = ERR_PTR(-ENOMEM);
- goto out;
- }
+ if (!cxlr_dax)
+ return ERR_PTR(-ENOMEM);
cxlr_dax->hpa_range.start = p->res->start;
cxlr_dax->hpa_range.end = p->res->end;
@@ -3061,8 +3070,6 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
dev->parent = &cxlr->dev;
dev->bus = &cxl_bus_type;
dev->type = &cxl_dax_region_type;
-out:
- up_read(&cxl_region_rwsem);
return cxlr_dax;
}
@@ -3208,7 +3215,6 @@ static int match_region_by_range(struct device *dev, const void *data)
struct cxl_region_params *p;
struct cxl_region *cxlr;
const struct range *r = data;
- int rc = 0;
if (!is_cxl_region(dev))
return 0;
@@ -3216,60 +3222,96 @@ static int match_region_by_range(struct device *dev, const void *data)
cxlr = to_cxl_region(dev);
p = &cxlr->params;
- down_read(&cxl_region_rwsem);
+ guard(rwsem_read)(&cxl_region_rwsem);
if (p->res && p->res->start == r->start && p->res->end == r->end)
- rc = 1;
- up_read(&cxl_region_rwsem);
+ return 1;
- return rc;
+ return 0;
}
-/* Establish an empty region covering the given HPA range */
-static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
- struct cxl_endpoint_decoder *cxled)
+static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
+ struct resource *res)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_region_params *p = &cxlr->params;
+ int nid = phys_to_target_node(res->start);
+ resource_size_t size = resource_size(res);
+ resource_size_t cache_size, start;
+ int rc;
+
+ rc = cxl_acpi_get_extended_linear_cache_size(res, nid, &cache_size);
+ if (rc)
+ return rc;
+
+ if (!cache_size)
+ return 0;
+
+ if (size != cache_size) {
+ dev_warn(&cxlr->dev,
+ "Extended Linear Cache size %pa != CXL size %pa. No Support!",
+ &cache_size, &size);
+ return -ENXIO;
+ }
+
+ /*
+ * Move the start of the range to where the cache range starts. The
+ * implementation assumes that the cache range is in front of the
+ * CXL range. This is not dictated by the HMAT spec but is how the
+ * current known implementation is configured.
+ *
+ * The cache range is expected to be within the CFMWS. The adjusted
+ * res->start should not be less than cxlrd->res->start.
+ */
+ start = res->start - cache_size;
+ if (start < cxlrd->res->start)
+ return -ENXIO;
+
+ res->start = start;
+ p->cache_size = cache_size;
+
+ return 0;
+}
+
+static int __construct_region(struct cxl_region *cxlr,
+ struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled)
{
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
- struct cxl_port *port = cxlrd_to_port(cxlrd);
struct range *hpa = &cxled->cxld.hpa_range;
struct cxl_region_params *p;
- struct cxl_region *cxlr;
struct resource *res;
int rc;
- do {
- cxlr = __create_region(cxlrd, cxled->mode,
- atomic_read(&cxlrd->region_id));
- } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);
-
- if (IS_ERR(cxlr)) {
- dev_err(cxlmd->dev.parent,
- "%s:%s: %s failed assign region: %ld\n",
- dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
- __func__, PTR_ERR(cxlr));
- return cxlr;
- }
-
- down_write(&cxl_region_rwsem);
+ guard(rwsem_write)(&cxl_region_rwsem);
p = &cxlr->params;
if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
dev_err(cxlmd->dev.parent,
"%s:%s: %s autodiscovery interrupted\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
__func__);
- rc = -EBUSY;
- goto err;
+ return -EBUSY;
}
set_bit(CXL_REGION_F_AUTO, &cxlr->flags);
res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res) {
- rc = -ENOMEM;
- goto err;
- }
+ if (!res)
+ return -ENOMEM;
*res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
dev_name(&cxlr->dev));
+
+ rc = cxl_extended_linear_cache_resize(cxlr, res);
+ if (rc && rc != -EOPNOTSUPP) {
+ /*
+ * Failing to support extended linear cache region resize does not
+ * prevent the region from functioning. It only causes 'cxl list' to
+ * show an incorrect region size.
+ */
+ dev_warn(cxlmd->dev.parent,
+ "Extended linear cache calculation failed rc:%d\n", rc);
+ }
+
rc = insert_resource(cxlrd->res, res);
if (rc) {
/*
@@ -3289,7 +3331,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
if (rc)
- goto err;
+ return rc;
dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__,
@@ -3298,14 +3340,40 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
/* ...to match put_device() in cxl_add_to_region() */
get_device(&cxlr->dev);
- up_write(&cxl_region_rwsem);
- return cxlr;
+ return 0;
+}
-err:
- up_write(&cxl_region_rwsem);
- devm_release_action(port->uport_dev, unregister_region, cxlr);
- return ERR_PTR(rc);
+/* Establish an empty region covering the given HPA range */
+static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *port = cxlrd_to_port(cxlrd);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ int rc, part = READ_ONCE(cxled->part);
+ struct cxl_region *cxlr;
+
+ do {
+ cxlr = __create_region(cxlrd, cxlds->part[part].mode,
+ atomic_read(&cxlrd->region_id));
+ } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);
+
+ if (IS_ERR(cxlr)) {
+ dev_err(cxlmd->dev.parent,
+ "%s:%s: %s failed assign region: %ld\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ __func__, PTR_ERR(cxlr));
+ return cxlr;
+ }
+
+ rc = __construct_region(cxlr, cxlrd, cxled);
+ if (rc) {
+ devm_release_action(port->uport_dev, unregister_region, cxlr);
+ return ERR_PTR(rc);
+ }
+
+ return cxlr;
}
int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled)
@@ -3375,6 +3443,34 @@ out:
}
EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
+{
+ struct cxl_region_ref *iter;
+ unsigned long index;
+
+ if (!endpoint)
+ return ~0ULL;
+
+ guard(rwsem_write)(&cxl_region_rwsem);
+
+ xa_for_each(&endpoint->regions, index, iter) {
+ struct cxl_region_params *p = &iter->region->params;
+
+ if (p->res->start <= spa && spa <= p->res->end) {
+ if (!p->cache_size)
+ return ~0ULL;
+
+ if (spa >= p->res->start + p->cache_size)
+ return spa - p->cache_size;
+
+ return spa + p->cache_size;
+ }
+ }
+
+ return ~0ULL;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL");
+
static int is_system_ram(struct resource *res, void *arg)
{
struct cxl_region *cxlr = arg;
@@ -3440,9 +3536,9 @@ out:
return rc;
switch (cxlr->mode) {
- case CXL_DECODER_PMEM:
+ case CXL_PARTMODE_PMEM:
return devm_cxl_add_pmem_region(cxlr);
- case CXL_DECODER_RAM:
+ case CXL_PARTMODE_RAM:
/*
* The region cannot be managed by CXL if any portion of
* it is already online as 'System RAM'
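For reference, the extended linear cache plumbing above (cxl_extended_linear_cache_resize(), the p->cache_size term in cxl_dpa_to_hpa(), and cxl_port_get_spa_cache_alias()) reduces to simple offset arithmetic: the region resource covers the DRAM cache plus the CXL capacity, with the cache fronting the first cache_size bytes, so every SPA in the region has one alias cache_size away on the other side. A minimal standalone sketch of that alias math, with hypothetical sizes:

/* Sketch of the alias computation done by cxl_port_get_spa_cache_alias(). */
#include <stdint.h>
#include <stdio.h>

static uint64_t spa_cache_alias(uint64_t start, uint64_t end,
				uint64_t cache_size, uint64_t spa)
{
	if (spa < start || spa > end || !cache_size)
		return ~0ULL;			/* no alias */
	if (spa >= start + cache_size)
		return spa - cache_size;	/* CXL-backed half -> cache half */
	return spa + cache_size;		/* cache half -> CXL-backed half */
}

int main(void)
{
	/* hypothetical 2G region, the first 1G of which is the DRAM cache */
	uint64_t start = 0x100000000ULL, cache = 0x40000000ULL;
	uint64_t end = start + 2 * cache - 1;

	printf("%#llx\n", (unsigned long long)
	       spa_cache_alias(start, end, cache, start + 0x1000));
	printf("%#llx\n", (unsigned long long)
	       spa_cache_alias(start, end, cache, start + cache + 0x1000));
	return 0;
}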
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index cea706b683b5..25ebfbc1616c 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -48,6 +48,34 @@
{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
)
+TRACE_EVENT(cxl_port_aer_uncorrectable_error,
+ TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
+ TP_ARGS(dev, status, fe, hl),
+ TP_STRUCT__entry(
+ __string(device, dev_name(dev))
+ __string(host, dev_name(dev->parent))
+ __field(u32, status)
+ __field(u32, first_error)
+ __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+ ),
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(host);
+ __entry->status = status;
+ __entry->first_error = fe;
+ /*
+ * Embed the 512B headerlog data for user app retrieval and
+ * parsing, but no need to print this in the trace buffer.
+ */
+ memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+ ),
+ TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
+ __get_str(device), __get_str(host),
+ show_uc_errs(__entry->status),
+ show_uc_errs(__entry->first_error)
+ )
+);
+
TRACE_EVENT(cxl_aer_uncorrectable_error,
TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
TP_ARGS(cxlmd, status, fe, hl),
@@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
)
+TRACE_EVENT(cxl_port_aer_correctable_error,
+ TP_PROTO(struct device *dev, u32 status),
+ TP_ARGS(dev, status),
+ TP_STRUCT__entry(
+ __string(device, dev_name(dev))
+ __string(host, dev_name(dev->parent))
+ __field(u32, status)
+ ),
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(host);
+ __entry->status = status;
+ ),
+ TP_printk("device=%s host=%s status='%s'",
+ __get_str(device), __get_str(host),
+ show_ce_errs(__entry->status)
+ )
+);
+
TRACE_EVENT(cxl_aer_correctable_error,
TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
TP_ARGS(cxlmd, status),
@@ -392,9 +439,10 @@ TRACE_EVENT(cxl_generic_event,
TRACE_EVENT(cxl_general_media,
TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
- struct cxl_region *cxlr, u64 hpa, struct cxl_event_gen_media *rec),
+ struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0,
+ struct cxl_event_gen_media *rec),
- TP_ARGS(cxlmd, log, cxlr, hpa, rec),
+ TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec),
TP_STRUCT__entry(
CXL_EVT_TP_entry
@@ -408,6 +456,7 @@ TRACE_EVENT(cxl_general_media,
__array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
/* Following are out of order to pack trace record */
__field(u64, hpa)
+ __field(u64, hpa_alias0)
__field_struct(uuid_t, region_uuid)
__field(u16, validity_flags)
__field(u8, rank)
@@ -438,6 +487,7 @@ TRACE_EVENT(cxl_general_media,
CXL_EVENT_GEN_MED_COMP_ID_SIZE);
__entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags);
__entry->hpa = hpa;
+ __entry->hpa_alias0 = hpa_alias0;
if (cxlr) {
__assign_str(region_name);
uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
@@ -455,7 +505,7 @@ TRACE_EVENT(cxl_general_media,
"device=%x validity_flags='%s' " \
"comp_id=%s comp_id_pldm_valid_flags='%s' " \
"pldm_entity_id=%s pldm_resource_id=%s " \
- "hpa=%llx region=%s region_uuid=%pUb " \
+ "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \
"cme_threshold_ev_flags='%s' cme_count=%u",
__entry->dpa, show_dpa_flags(__entry->dpa_flags),
show_event_desc_flags(__entry->descriptor),
@@ -470,7 +520,7 @@ TRACE_EVENT(cxl_general_media,
CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
show_pldm_resource_id(__entry->validity_flags, CXL_GMER_VALID_COMPONENT,
CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
- __entry->hpa, __get_str(region_name), &__entry->region_uuid,
+ __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid,
show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), __entry->cme_count
)
);
@@ -529,9 +579,10 @@ TRACE_EVENT(cxl_general_media,
TRACE_EVENT(cxl_dram,
TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
- struct cxl_region *cxlr, u64 hpa, struct cxl_event_dram *rec),
+ struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0,
+ struct cxl_event_dram *rec),
- TP_ARGS(cxlmd, log, cxlr, hpa, rec),
+ TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec),
TP_STRUCT__entry(
CXL_EVT_TP_entry
@@ -547,6 +598,7 @@ TRACE_EVENT(cxl_dram,
__field(u32, row)
__array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE)
__field(u64, hpa)
+ __field(u64, hpa_alias0)
__field_struct(uuid_t, region_uuid)
__field(u8, rank) /* Out of order to pack trace record */
__field(u8, bank_group) /* Out of order to pack trace record */
@@ -584,6 +636,7 @@ TRACE_EVENT(cxl_dram,
memcpy(__entry->cor_mask, &rec->correction_mask,
CXL_EVENT_DER_CORRECTION_MASK_SIZE);
__entry->hpa = hpa;
+ __entry->hpa_alias0 = hpa_alias0;
if (cxlr) {
__assign_str(region_name);
uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
@@ -604,7 +657,7 @@ TRACE_EVENT(cxl_dram,
"validity_flags='%s' " \
"comp_id=%s comp_id_pldm_valid_flags='%s' " \
"pldm_entity_id=%s pldm_resource_id=%s " \
- "hpa=%llx region=%s region_uuid=%pUb " \
+ "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \
"sub_channel=%u cme_threshold_ev_flags='%s' cvme_count=%u",
__entry->dpa, show_dpa_flags(__entry->dpa_flags),
show_event_desc_flags(__entry->descriptor),
@@ -622,7 +675,7 @@ TRACE_EVENT(cxl_dram,
CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
show_pldm_resource_id(__entry->validity_flags, CXL_DER_VALID_COMPONENT,
CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
- __entry->hpa, __get_str(region_name), &__entry->region_uuid,
+ __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid,
__entry->sub_channel, show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags),
__entry->cvme_count
)
@@ -870,6 +923,7 @@ TRACE_EVENT(cxl_poison,
__string(region, cxlr ? dev_name(&cxlr->dev) : "")
__field(u64, overflow_ts)
__field(u64, hpa)
+ __field(u64, hpa_alias0)
__field(u64, dpa)
__field(u32, dpa_length)
__array(char, uuid, 16)
@@ -892,16 +946,22 @@ TRACE_EVENT(cxl_poison,
memcpy(__entry->uuid, &cxlr->params.uuid, 16);
__entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd,
__entry->dpa);
+ if (__entry->hpa != ULLONG_MAX && cxlr->params.cache_size)
+ __entry->hpa_alias0 = __entry->hpa +
+ cxlr->params.cache_size;
+ else
+ __entry->hpa_alias0 = ULLONG_MAX;
} else {
__assign_str(region);
memset(__entry->uuid, 0, 16);
__entry->hpa = ULLONG_MAX;
+ __entry->hpa_alias0 = ULLONG_MAX;
}
),
TP_printk("memdev=%s host=%s serial=%lld trace_type=%s region=%s " \
- "region_uuid=%pU hpa=0x%llx dpa=0x%llx dpa_length=0x%x " \
- "source=%s flags=%s overflow_time=%llu",
+ "region_uuid=%pU hpa=0x%llx hpa_alias0=0x%llx dpa=0x%llx " \
+ "dpa_length=0x%x source=%s flags=%s overflow_time=%llu",
__get_str(memdev),
__get_str(host),
__entry->serial,
@@ -909,6 +969,7 @@ TRACE_EVENT(cxl_poison,
__get_str(region),
__entry->uuid,
__entry->hpa,
+ __entry->hpa_alias0,
__entry->dpa,
__entry->dpa_length,
show_poison_source(__entry->source),
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index bbbaa0d0a670..be8a7dc77719 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -373,32 +373,6 @@ struct cxl_decoder {
};
/*
- * CXL_DECODER_DEAD prevents endpoints from being reattached to regions
- * while cxld_unregister() is running
- */
-enum cxl_decoder_mode {
- CXL_DECODER_NONE,
- CXL_DECODER_RAM,
- CXL_DECODER_PMEM,
- CXL_DECODER_MIXED,
- CXL_DECODER_DEAD,
-};
-
-static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
-{
- static const char * const names[] = {
- [CXL_DECODER_NONE] = "none",
- [CXL_DECODER_RAM] = "ram",
- [CXL_DECODER_PMEM] = "pmem",
- [CXL_DECODER_MIXED] = "mixed",
- };
-
- if (mode >= CXL_DECODER_NONE && mode <= CXL_DECODER_MIXED)
- return names[mode];
- return "mixed";
-}
-
-/*
* Track whether this decoder is reserved for region autodiscovery, or
* free for userspace provisioning.
*/
@@ -412,16 +386,16 @@ enum cxl_decoder_state {
* @cxld: base cxl_decoder_object
* @dpa_res: actively claimed DPA span of this decoder
* @skip: offset into @dpa_res where @cxld.hpa_range maps
- * @mode: which memory type / access-mode-partition this decoder targets
* @state: autodiscovery state
+ * @part: partition index this decoder maps
* @pos: interleave position in @cxld.region
*/
struct cxl_endpoint_decoder {
struct cxl_decoder cxld;
struct resource *dpa_res;
resource_size_t skip;
- enum cxl_decoder_mode mode;
enum cxl_decoder_state state;
+ int part;
int pos;
};
@@ -493,6 +467,7 @@ enum cxl_config_state {
* @res: allocated iomem capacity for this region
* @targets: active ordered targets in current decoder configuration
* @nr_targets: number of targets
+ * @cache_size: extended linear cache size if it exists, otherwise zero.
*
* State transitions are protected by the cxl_region_rwsem
*/
@@ -504,6 +479,12 @@ struct cxl_region_params {
struct resource *res;
struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE];
int nr_targets;
+ resource_size_t cache_size;
+};
+
+enum cxl_partition_mode {
+ CXL_PARTMODE_RAM,
+ CXL_PARTMODE_PMEM,
};
/*
@@ -525,7 +506,7 @@ struct cxl_region_params {
* struct cxl_region - CXL region
* @dev: This region's device
* @id: This region's id. Id is globally unique across all regions
- * @mode: Endpoint decoder allocation / access mode
+ * @mode: Operational mode of the mapped capacity
* @type: Endpoint decoder target type
* @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
* @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
@@ -538,7 +519,7 @@ struct cxl_region_params {
struct cxl_region {
struct device dev;
int id;
- enum cxl_decoder_mode mode;
+ enum cxl_partition_mode mode;
enum cxl_decoder_type type;
struct cxl_nvdimm_bridge *cxl_nvb;
struct cxl_pmem_region *cxlr_pmem;
@@ -563,6 +544,7 @@ struct cxl_nvdimm {
struct device dev;
struct cxl_memdev *cxlmd;
u8 dev_id[CXL_DEV_ID_LEN]; /* for nvdimm, string of 'serial' */
+ u64 dirty_shutdowns;
};
struct cxl_pmem_region_mapping {
@@ -610,6 +592,7 @@ struct cxl_dax_region {
* @cdat: Cached CDAT data
* @cdat_available: Should a CDAT attribute be available in sysfs
* @pci_latency: Upstream latency in picoseconds
+ * @gpf_dvsec: Cached GPF port DVSEC
*/
struct cxl_port {
struct device dev;
@@ -633,6 +616,7 @@ struct cxl_port {
} cdat;
bool cdat_available;
long pci_latency;
+ int gpf_dvsec;
};
/**
@@ -875,6 +859,7 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
int cxl_add_to_region(struct cxl_port *root,
struct cxl_endpoint_decoder *cxled);
struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
#else
static inline bool is_cxl_pmem_region(struct device *dev)
{
@@ -893,6 +878,11 @@ static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
{
return NULL;
}
+static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint,
+ u64 spa)
+{
+ return 0;
+}
#endif
void cxl_endpoint_parse_cdat(struct cxl_port *port);
@@ -920,4 +910,6 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
#define __mock static
#endif
+u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port);
+
#endif /* __CXL_H__ */
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index dd2b7060d501..3ec6b906371b 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -97,6 +97,19 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
resource_size_t base, resource_size_t len,
resource_size_t skipped);
+#define CXL_NR_PARTITIONS_MAX 2
+
+struct cxl_dpa_info {
+ u64 size;
+ struct cxl_dpa_part_info {
+ struct range range;
+ enum cxl_partition_mode mode;
+ } part[CXL_NR_PARTITIONS_MAX];
+ int nr_partitions;
+};
+
+int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info);
+
static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
struct cxl_memdev *cxlmd)
{
@@ -373,6 +386,18 @@ struct cxl_dpa_perf {
};
/**
+ * struct cxl_dpa_partition - DPA partition descriptor
+ * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res)
+ * @perf: performance attributes of the partition from CDAT
+ * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic...
+ */
+struct cxl_dpa_partition {
+ struct resource res;
+ struct cxl_dpa_perf perf;
+ enum cxl_partition_mode mode;
+};
+
+/**
* struct cxl_dev_state - The driver device state
*
* cxl_dev_state represents the CXL driver/device state. It provides an
@@ -387,8 +412,8 @@ struct cxl_dpa_perf {
* @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH)
* @media_ready: Indicate whether the device media is usable
* @dpa_res: Overall DPA resource tree for the device
- * @pmem_res: Active Persistent memory capacity configuration
- * @ram_res: Active Volatile memory capacity configuration
+ * @part: DPA partition array
+ * @nr_partitions: Number of DPA partitions
* @serial: PCIe Device Serial Number
* @type: Generic Memory Class device or Vendor Specific Memory device
* @cxl_mbox: CXL mailbox context
@@ -403,8 +428,8 @@ struct cxl_dev_state {
bool rcd;
bool media_ready;
struct resource dpa_res;
- struct resource pmem_res;
- struct resource ram_res;
+ struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX];
+ unsigned int nr_partitions;
u64 serial;
enum cxl_devtype type;
struct cxl_mailbox cxl_mbox;
@@ -413,6 +438,18 @@ struct cxl_dev_state {
#endif
};
+static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds)
+{
+ /*
+ * Static PMEM may be at partition index 0 when there is no static RAM
+ * capacity.
+ */
+ for (int i = 0; i < cxlds->nr_partitions; i++)
+ if (cxlds->part[i].mode == CXL_PARTMODE_PMEM)
+ return resource_size(&cxlds->part[i].res);
+ return 0;
+}
+
static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
{
return dev_get_drvdata(cxl_mbox->host);
@@ -435,14 +472,11 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
* @partition_align_bytes: alignment size for partition-able capacity
* @active_volatile_bytes: sum of hard + soft volatile
* @active_persistent_bytes: sum of hard + soft persistent
- * @next_volatile_bytes: volatile capacity change pending device reset
- * @next_persistent_bytes: persistent capacity change pending device reset
- * @ram_perf: performance data entry matched to RAM partition
- * @pmem_perf: performance data entry matched to PMEM partition
* @event: event log driver state
* @poison: poison driver state info
* @security: security driver state info
* @fw: firmware upload / activation state
+ * @mce_notifier: MCE notifier
*
* See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for
* details on capacity parameters.
@@ -457,16 +491,12 @@ struct cxl_memdev_state {
u64 partition_align_bytes;
u64 active_volatile_bytes;
u64 active_persistent_bytes;
- u64 next_volatile_bytes;
- u64 next_persistent_bytes;
-
- struct cxl_dpa_perf ram_perf;
- struct cxl_dpa_perf pmem_perf;
struct cxl_event_state event;
struct cxl_poison_state poison;
struct cxl_security_state security;
struct cxl_fw_state fw;
+ struct notifier_block mce_notifier;
};
static inline struct cxl_memdev_state *
@@ -660,6 +690,23 @@ struct cxl_mbox_set_partition_info {
#define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0)
+/* Get Health Info Output Payload CXL 3.2 Spec 8.2.10.9.3.1 Table 8-148 */
+struct cxl_mbox_get_health_info_out {
+ u8 health_status;
+ u8 media_status;
+ u8 additional_status;
+ u8 life_used;
+ __le16 device_temperature;
+ __le32 dirty_shutdown_cnt;
+ __le32 corrected_volatile_error_cnt;
+ __le32 corrected_persistent_error_cnt;
+} __packed;
+
+/* Set Shutdown State Input Payload CXL 3.2 Spec 8.2.10.9.3.5 Table 8-152 */
+struct cxl_mbox_set_shutdown_state_in {
+ u8 state;
+} __packed;
+
/* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
struct cxl_mbox_set_timestamp_in {
__le64 timestamp;
@@ -785,7 +832,7 @@ int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox,
int cxl_dev_state_identify(struct cxl_memdev_state *mds);
int cxl_await_media_ready(struct cxl_dev_state *cxlds);
int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
-int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
+int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info);
struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev);
void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
unsigned long *cmds);
@@ -796,6 +843,8 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
enum cxl_event_log_type type,
enum cxl_event_type event_type,
const uuid_t *uuid, union cxl_event *evt);
+int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count);
+int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds);
int cxl_set_timestamp(struct cxl_memdev_state *mds);
int cxl_poison_state_init(struct cxl_memdev_state *mds);
int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
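For reference, the cxlmem.h changes above replace the fixed ram_res/pmem_res pair with a small partition array walked by mode, as cxl_pmem_size() shows. A standalone sketch of that lookup, with simplified stand-ins for struct cxl_dev_state and struct cxl_dpa_partition (the types and values below are illustrative, not the driver's):

/* Sketch of walking the partition array instead of ram_res/pmem_res. */
#include <stdint.h>
#include <stdio.h>

enum part_mode { PARTMODE_RAM, PARTMODE_PMEM };

struct part { uint64_t start, end; enum part_mode mode; };

struct dev_state {
	struct part part[2];
	unsigned int nr_partitions;
};

static uint64_t part_size(const struct dev_state *cxlds, enum part_mode mode)
{
	for (unsigned int i = 0; i < cxlds->nr_partitions; i++)
		if (cxlds->part[i].mode == mode)
			return cxlds->part[i].end - cxlds->part[i].start + 1;
	return 0;	/* mode not present, e.g. a RAM-only device */
}

int main(void)
{
	struct dev_state d = {
		.part = {
			{ 0x0,        0x3fffffff, PARTMODE_RAM  },
			{ 0x40000000, 0x7fffffff, PARTMODE_PMEM },
		},
		.nr_partitions = 2,
	};

	printf("pmem bytes: %#llx\n",
	       (unsigned long long)part_size(&d, PARTMODE_PMEM));
	return 0;
}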
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 4da07727ab9c..54e219b0049e 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -40,6 +40,12 @@
/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */
#define CXL_DVSEC_PORT_GPF 4
+#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C
+#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0)
+#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8)
+#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE
+#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0)
+#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8)
/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */
#define CXL_DVSEC_DEVICE_GPF 5
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 2f03a4d5606e..9675243bd05b 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -152,7 +152,7 @@ static int cxl_mem_probe(struct device *dev)
return -ENXIO;
}
- if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) {
+ if (cxl_pmem_size(cxlds) && IS_ENABLED(CONFIG_CXL_PMEM)) {
rc = devm_cxl_add_nvdimm(parent_port, cxlmd);
if (rc) {
if (rc == -ENODEV)
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 993fa60fe453..7b14a154463c 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -903,6 +903,7 @@ __ATTRIBUTE_GROUPS(cxl_rcd);
static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
+ struct cxl_dpa_info range_info = { 0 };
struct cxl_memdev_state *mds;
struct cxl_dev_state *cxlds;
struct cxl_register_map map;
@@ -993,7 +994,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (rc)
return rc;
- rc = cxl_mem_create_range_info(mds);
+ rc = cxl_mem_dpa_fetch(mds, &range_info);
+ if (rc)
+ return rc;
+
+ rc = cxl_dpa_setup(cxlds, &range_info);
if (rc)
return rc;
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index f9c95996e937..d061fe3d2b86 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -42,15 +42,44 @@ static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *
}
static DEVICE_ATTR_RO(id);
+static ssize_t dirty_shutdown_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+
+ return sysfs_emit(buf, "%llu\n", cxl_nvd->dirty_shutdowns);
+}
+static DEVICE_ATTR_RO(dirty_shutdown);
+
static struct attribute *cxl_dimm_attributes[] = {
&dev_attr_id.attr,
&dev_attr_provider.attr,
+ &dev_attr_dirty_shutdown.attr,
NULL
};
+#define CXL_INVALID_DIRTY_SHUTDOWN_COUNT ULLONG_MAX
+static umode_t cxl_dimm_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ if (a == &dev_attr_dirty_shutdown.attr) {
+ struct device *dev = kobj_to_dev(kobj);
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+
+ if (cxl_nvd->dirty_shutdowns ==
+ CXL_INVALID_DIRTY_SHUTDOWN_COUNT)
+ return 0;
+ }
+
+ return a->mode;
+}
+
static const struct attribute_group cxl_dimm_attribute_group = {
.name = "cxl",
.attrs = cxl_dimm_attributes,
+ .is_visible = cxl_dimm_visible
};
static const struct attribute_group *cxl_dimm_attribute_groups[] = {
@@ -58,6 +87,38 @@ static const struct attribute_group *cxl_dimm_attribute_groups[] = {
NULL
};
+static void cxl_nvdimm_arm_dirty_shutdown_tracking(struct cxl_nvdimm *cxl_nvd)
+{
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+ struct device *dev = &cxl_nvd->dev;
+ u32 count;
+
+ /*
+ * Dirty tracking is enabled and exposed to the user only when:
+ * - dirty shutdown on the device can be set, and,
+ * - the device has a Device GPF DVSEC (albeit unused), and,
+ * - the Get Health Info cmd can retrieve the device's dirty count.
+ */
+ cxl_nvd->dirty_shutdowns = CXL_INVALID_DIRTY_SHUTDOWN_COUNT;
+
+ if (cxl_arm_dirty_shutdown(mds)) {
+ dev_warn(dev, "GPF: could not set dirty shutdown state\n");
+ return;
+ }
+
+ if (!cxl_gpf_get_dvsec(cxlds->dev, false))
+ return;
+
+ if (cxl_get_dirty_count(mds, &count)) {
+ dev_warn(dev, "GPF: could not retrieve dirty count\n");
+ return;
+ }
+
+ cxl_nvd->dirty_shutdowns = count;
+}
+
static int cxl_nvdimm_probe(struct device *dev)
{
struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
@@ -78,6 +139,14 @@ static int cxl_nvdimm_probe(struct device *dev)
set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
+
+ /*
+ * Set dirty shutdown now, with the expectation that the device
+ * clear it upon a successful GPF flow. The exception to this
+ * is upon Viral detection, per CXL 3.2 section 12.4.2.
+ */
+ cxl_nvdimm_arm_dirty_shutdown_tracking(cxl_nvd);
+
nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd,
cxl_dimm_attribute_groups, flags,
cmd_mask, 0, NULL, cxl_nvd->dev_id,
@@ -375,6 +444,16 @@ static int cxl_pmem_region_probe(struct device *dev)
goto out_nvd;
}
+ if (cxlds->serial == 0) {
+ /* include missing alongside invalid in this error message. */
+ dev_err(dev, "%s: invalid or missing serial number\n",
+ dev_name(&cxlmd->dev));
+ rc = -ENXIO;
+ goto out_nvd;
+ }
+ info[i].serial = cxlds->serial;
+ info[i].offset = m->start;
+
m->cxl_nvd = cxl_nvd;
mappings[i] = (struct nd_mapping_desc) {
.nvdimm = nvdimm,
@@ -382,8 +461,6 @@ static int cxl_pmem_region_probe(struct device *dev)
.size = m->size,
.position = i,
};
- info[i].offset = m->start;
- info[i].serial = cxlds->serial;
}
ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
ndr_desc.mapping = mappings;
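For reference, the dirty_shutdown attribute added above lives in the nvdimm's 'cxl' sysfs group and is hidden when the count is invalid. A userspace sketch of reading it; the exact path is an assumption (adjust the nmemX name for the real device):

/* Read the dirty shutdown count exported by the 'cxl' nvdimm group. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/bus/nd/devices/nmem0/cxl/dirty_shutdown"; /* hypothetical */
	char buf[32];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;	/* attribute absent when the count is invalid */
	}
	if (fgets(buf, sizeof(buf), f))
		printf("dirty shutdown count: %s", buf);
	fclose(f);
	return 0;
}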
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index b69e68ef3f02..928409199a1a 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -24,7 +24,7 @@
#include <linux/bcd.h>
#include <acpi/ghes.h>
#include <ras/ras_event.h>
-#include "cper_cxl.h"
+#include <cxl/event.h>
/*
* CPER record ID need to be unique even after reboot, because record
@@ -624,11 +624,11 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
else
goto err_section_too_small;
} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
- struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+ struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
printk("%ssection_type: CXL Protocol Error\n", newpfx);
if (gdata->error_data_length >= sizeof(*prot_err))
- cper_print_prot_err(newpfx, prot_err);
+ cxl_cper_print_prot_err(newpfx, prot_err);
else
goto err_section_too_small;
} else {
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index a55771b99a97..8a7667faf953 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,26 +8,7 @@
*/
#include <linux/cper.h>
-#include "cper_cxl.h"
-
-#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
-#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1)
-#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2)
-#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3)
-#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4)
-#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
-#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
-
-/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
-struct cxl_ras_capability_regs {
- u32 uncor_status;
- u32 uncor_mask;
- u32 uncor_severity;
- u32 cor_status;
- u32 cor_mask;
- u32 cap_control;
- u32 header_log[16];
-};
+#include <cxl/event.h>
static const char * const prot_err_agent_type_strs[] = {
"Restricted CXL Device",
@@ -40,22 +21,8 @@ static const char * const prot_err_agent_type_strs[] = {
"CXL Upstream Switch Port",
};
-/*
- * The layout of the enumeration and the values matches CXL Agent Type
- * field in the UEFI 2.10 Section N.2.13,
- */
-enum {
- RCD, /* Restricted CXL Device */
- RCH_DP, /* Restricted CXL Host Downstream Port */
- DEVICE, /* CXL Device */
- LD, /* CXL Logical Device */
- FMLD, /* CXL Fabric Manager managed Logical Device */
- RP, /* CXL Root Port */
- DSP, /* CXL Downstream Switch Port */
- USP, /* CXL Upstream Switch Port */
-};
-
-void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
+void cxl_cper_print_prot_err(const char *pfx,
+ const struct cxl_cper_sec_prot_err *prot_err)
{
if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
pr_info("%s agent_type: %d, %s\n", pfx, prot_err->agent_type,
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
deleted file mode 100644
index 86bfcf7909ec..000000000000
--- a/drivers/firmware/efi/cper_cxl.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * UEFI Common Platform Error Record (CPER) support for CXL Section.
- *
- * Copyright (C) 2022 Advanced Micro Devices, Inc.
- *
- * Author: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
- */
-
-#ifndef LINUX_CPER_CXL_H
-#define LINUX_CPER_CXL_H
-
-/* CXL Protocol Error Section */
-#define CPER_SEC_CXL_PROT_ERR \
- GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
- 0x4B, 0x77, 0x10, 0x48)
-
-#pragma pack(1)
-
-/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
-struct cper_sec_prot_err {
- u64 valid_bits;
- u8 agent_type;
- u8 reserved[7];
-
- /*
- * Except for RCH Downstream Port, all the remaining CXL Agent
- * types are uniquely identified by the PCIe compatible SBDF number.
- */
- union {
- u64 rcrb_base_addr;
- struct {
- u8 function;
- u8 device;
- u8 bus;
- u16 segment;
- u8 reserved_1[3];
- };
- } agent_addr;
-
- struct {
- u16 vendor_id;
- u16 device_id;
- u16 subsystem_vendor_id;
- u16 subsystem_id;
- u8 class_code[2];
- u16 slot;
- u8 reserved_1[4];
- } device_id;
-
- struct {
- u32 lower_dw;
- u32 upper_dw;
- } dev_serial_num;
-
- u8 capability[60];
- u16 dvsec_len;
- u16 err_len;
- u8 reserved_2[4];
-};
-
-#pragma pack()
-
-void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
-
-#endif //__CPER_CXL_
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 04edd44bd26f..f9ae1796da85 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -164,10 +164,99 @@ struct cxl_cper_work_data {
struct cxl_cper_event_rec rec;
};
+#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0)
+#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1)
+#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2)
+#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3)
+#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4)
+#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
+#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
+
+/*
+ * The layout of the enumeration and the values match the CXL Agent Type
+ * field in UEFI 2.10 Section N.2.13.
+ */
+enum {
+ RCD, /* Restricted CXL Device */
+ RCH_DP, /* Restricted CXL Host Downstream Port */
+ DEVICE, /* CXL Device */
+ LD, /* CXL Logical Device */
+ FMLD, /* CXL Fabric Manager managed Logical Device */
+ RP, /* CXL Root Port */
+ DSP, /* CXL Downstream Switch Port */
+ USP, /* CXL Upstream Switch Port */
+};
+
+#pragma pack(1)
+
+/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
+struct cxl_cper_sec_prot_err {
+ u64 valid_bits;
+ u8 agent_type;
+ u8 reserved[7];
+
+ /*
+ * Except for RCH Downstream Port, all the remaining CXL Agent
+ * types are uniquely identified by the PCIe compatible SBDF number.
+ */
+ union {
+ u64 rcrb_base_addr;
+ struct {
+ u8 function;
+ u8 device;
+ u8 bus;
+ u16 segment;
+ u8 reserved_1[3];
+ };
+ } agent_addr;
+
+ struct {
+ u16 vendor_id;
+ u16 device_id;
+ u16 subsystem_vendor_id;
+ u16 subsystem_id;
+ u8 class_code[2];
+ u16 slot;
+ u8 reserved_1[4];
+ } device_id;
+
+ struct {
+ u32 lower_dw;
+ u32 upper_dw;
+ } dev_serial_num;
+
+ u8 capability[60];
+ u16 dvsec_len;
+ u16 err_len;
+ u8 reserved_2[4];
+};
+
+#pragma pack()
+
+/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
+struct cxl_ras_capability_regs {
+ u32 uncor_status;
+ u32 uncor_mask;
+ u32 uncor_severity;
+ u32 cor_status;
+ u32 cor_mask;
+ u32 cap_control;
+ u32 header_log[16];
+};
+
+struct cxl_cper_prot_err_work_data {
+ struct cxl_cper_sec_prot_err prot_err;
+ struct cxl_ras_capability_regs ras_cap;
+ int severity;
+};
+
#ifdef CONFIG_ACPI_APEI_GHES
int cxl_cper_register_work(struct work_struct *work);
int cxl_cper_unregister_work(struct work_struct *work);
int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
+int cxl_cper_register_prot_err_work(struct work_struct *work);
+int cxl_cper_unregister_prot_err_work(struct work_struct *work);
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
#else
static inline int cxl_cper_register_work(struct work_struct *work)
{
@@ -182,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
{
return 0;
}
+static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+ return 0;
+}
+static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+ return 0;
+}
+static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+ return 0;
+}
#endif
#endif /* _LINUX_CXL_EVENT_H */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a70e62d69dc7..3f2e93ed9730 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1094,6 +1094,17 @@ static inline acpi_handle acpi_get_processor_handle(int cpu)
#endif /* !CONFIG_ACPI */
+#ifdef CONFIG_ACPI_HMAT
+int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid,
+ resource_size_t *size);
+#else
+static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res,
+ int nid, resource_size_t *size)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
extern void arch_post_acpi_subsys_init(void);
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 265b0f8fc0b3..0ed60a91eca9 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -89,6 +89,10 @@ enum {
#define CPER_NOTIFY_DMAR \
GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
0x72, 0x2D, 0xEB, 0x41)
+/* CXL Protocol Error Section */
+#define CPER_SEC_CXL_PROT_ERR \
+ GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
+ 0x4B, 0x77, 0x10, 0x48)
/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
/*
@@ -601,4 +605,8 @@ void cper_estatus_print(const char *pfx,
int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus);
int cper_estatus_check(const struct acpi_hest_generic_status *estatus);
+struct cxl_cper_sec_prot_err;
+void cxl_cper_print_prot_err(const char *pfx,
+ const struct cxl_cper_sec_prot_err *prot_err);
+
#endif
diff --git a/include/linux/node.h b/include/linux/node.h
index 9a881c2208b3..2b7517892230 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -57,6 +57,11 @@ enum cache_write_policy {
NODE_CACHE_WRITE_OTHER,
};
+enum cache_mode {
+ NODE_CACHE_ADDR_MODE_RESERVED,
+ NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR,
+};
+
/**
* struct node_cache_attrs - system memory caching attributes
*
@@ -65,6 +70,7 @@ enum cache_write_policy {
* @size: Total size of cache in bytes
* @line_size: Number of bytes fetched on a cache miss
* @level: The cache hierarchy level
+ * @address_mode: The address mode
*/
struct node_cache_attrs {
enum cache_indexing indexing;
@@ -72,6 +78,7 @@ struct node_cache_attrs {
u64 size;
u16 line_size;
u8 level;
+ u16 address_mode;
};
#ifdef CONFIG_HMEM_REPORTING
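
The new address_mode field and cache_mode enum let HMAT report whether a memory-side cache is an extended linear cache. A small sketch of populating the attribute (the helper and its caller are hypothetical; only struct node_cache_attrs and the enum values above come from the patch), after which the existing node_add_cache() path would presumably surface it alongside the other node cache sysfs attributes:

	static void example_fill_cache_attrs(struct node_cache_attrs *attrs,
					     u64 size, u16 line_size, u8 level,
					     bool extended_linear)
	{
		attrs->size = size;
		attrs->line_size = line_size;
		attrs->level = level;
		/* Record the address mode reported by HMAT for this cache */
		attrs->address_mode = extended_linear ?
				      NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR :
				      NODE_CACHE_ADDR_MODE_RESERVED;
	}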
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 0a6572ab6f37..387f3df8b988 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -61,8 +61,11 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
cxl_core-y += $(CXL_CORE_SRC)/hdm.o
cxl_core-y += $(CXL_CORE_SRC)/pmu.o
cxl_core-y += $(CXL_CORE_SRC)/cdat.o
+cxl_core-y += $(CXL_CORE_SRC)/ras.o
+cxl_core-y += $(CXL_CORE_SRC)/acpi.o
cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
+cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o
cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o
cxl_core-y += config_check.o
cxl_core-y += cxl_core_test.o
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index cc8948f49117..1c3336095923 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -155,7 +155,7 @@ static struct {
} cfmws7;
struct {
struct acpi_cedt_cfmws cfmws;
- u32 target[4];
+ u32 target[3];
} cfmws8;
struct {
struct acpi_cedt_cxims cxims;
@@ -331,14 +331,14 @@ static struct {
.length = sizeof(mock_cedt.cfmws8),
},
.interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR,
- .interleave_ways = 2,
- .granularity = 0,
+ .interleave_ways = 8,
+ .granularity = 1,
.restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 |
ACPI_CEDT_CFMWS_RESTRICT_PMEM,
.qtg_id = FAKE_QTG_ID,
- .window_size = SZ_256M * 16UL,
+ .window_size = SZ_512M * 6UL,
},
- .target = { 0, 1, 0, 1, },
+ .target = { 0, 1, 2, },
},
.cxims0 = {
.cxims = {
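
For reference on the cfmws8 rework above: under the CEDT CFMWS encodings, an interleave_ways value of 8 denotes a 3-way interleave and a granularity value of 1 denotes 512 bytes, so the window now lists three host-bridge targets and shrinks to SZ_512M * 6 = 3 GiB, which (unlike the previous 4 GiB) is a multiple of 3 ways * 256 MiB as the window-size rule requires. A throwaway sanity check of that arithmetic (the function is hypothetical; constants are from linux/sizes.h):

	#include <linux/sizes.h>

	/* Informal check of the cfmws8 numbers above; not part of the patch */
	static bool example_cfmws8_fits(void)
	{
		unsigned int ways = 3;			/* ENIW encoding 8 => 3-way */
		unsigned long long window = SZ_512M * 6ULL;	/* 3 GiB */

		/* Window size must be a multiple of ways * 256 MiB */
		return (window % (ways * (unsigned long long)SZ_256M)) == 0;
	}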
@@ -1000,25 +1000,21 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port)
find_cxl_root(port);
struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
- struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
struct access_coordinate ep_c[ACCESS_COORDINATE_MAX];
- struct range pmem_range = {
- .start = cxlds->pmem_res.start,
- .end = cxlds->pmem_res.end,
- };
- struct range ram_range = {
- .start = cxlds->ram_res.start,
- .end = cxlds->ram_res.end,
- };
if (!cxl_root)
return;
- if (range_len(&ram_range))
- dpa_perf_setup(port, &ram_range, &mds->ram_perf);
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct resource *res = &cxlds->part[i].res;
+ struct cxl_dpa_perf *perf = &cxlds->part[i].perf;
+ struct range range = {
+ .start = res->start,
+ .end = res->end,
+ };
- if (range_len(&pmem_range))
- dpa_perf_setup(port, &pmem_range, &mds->pmem_perf);
+ dpa_perf_setup(port, &range, perf);
+ }
cxl_memdev_update_perf(cxlmd);
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 9495dbcc03a7..f2957a3e36fe 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -78,6 +78,10 @@ static struct cxl_cel_entry mock_cel[] = {
.effect = CXL_CMD_EFFECT_NONE,
},
{
+ .opcode = cpu_to_le16(CXL_MBOX_OP_SET_SHUTDOWN_STATE),
+ .effect = POLICY_CHANGE_IMMEDIATE,
+ },
+ {
.opcode = cpu_to_le16(CXL_MBOX_OP_GET_POISON),
.effect = CXL_CMD_EFFECT_NONE,
},
@@ -178,6 +182,7 @@ struct cxl_mockmem_data {
u64 timestamp;
unsigned long sanitize_timeout;
struct vendor_test_feat test_feat;
+ u8 shutdown_state;
};
static struct mock_event_log *event_find_log(struct device *dev, int log_type)
@@ -1105,6 +1110,21 @@ static int mock_health_info(struct cxl_mbox_cmd *cmd)
return 0;
}
+static int mock_set_shutdown_state(struct cxl_mockmem_data *mdata,
+ struct cxl_mbox_cmd *cmd)
+{
+ struct cxl_mbox_set_shutdown_state_in *ss = cmd->payload_in;
+
+ if (cmd->size_in != sizeof(*ss))
+ return -EINVAL;
+
+ if (cmd->size_out != 0)
+ return -EINVAL;
+
+ mdata->shutdown_state = ss->state;
+ return 0;
+}
+
static struct mock_poison {
struct cxl_dev_state *cxlds;
u64 dpa;
@@ -1583,6 +1603,9 @@ static int cxl_mock_mbox_send(struct cxl_mailbox *cxl_mbox,
case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE:
rc = mock_passphrase_secure_erase(mdata, cmd);
break;
+ case CXL_MBOX_OP_SET_SHUTDOWN_STATE:
+ rc = mock_set_shutdown_state(mdata, cmd);
+ break;
case CXL_MBOX_OP_GET_POISON:
rc = mock_get_poison(cxlds, cmd);
break;
@@ -1670,6 +1693,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
struct cxl_dev_state *cxlds;
struct cxl_mockmem_data *mdata;
struct cxl_mailbox *cxl_mbox;
+ struct cxl_dpa_info range_info = { 0 };
int rc;
mdata = devm_kzalloc(dev, sizeof(*mdata), GFP_KERNEL);
@@ -1709,7 +1733,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
mds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf;
INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mockmem_sanitize_work);
- cxlds->serial = pdev->id;
+ cxlds->serial = pdev->id + 1;
if (is_rcd(pdev))
cxlds->rcd = true;
@@ -1730,7 +1754,11 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
if (rc)
return rc;
- rc = cxl_mem_create_range_info(mds);
+ rc = cxl_mem_dpa_fetch(mds, &range_info);
+ if (rc)
+ return rc;
+
+ rc = cxl_dpa_setup(cxlds, &range_info);
if (rc)
return rc;